{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.464523281596452, "eval_steps": 500, "global_step": 12500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 510.5, "epoch": 0.0002771618625277162, "grad_norm": 0.2483018934726715, "kl": 0.0, "learning_rate": 0.0, "loss": -0.0, "reward": 3.09375, "reward_std": 2.9250621795654297, "rewards/confident_score_func": 0.25, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.59375, "step": 1 }, { "completion_length": 496.25, "epoch": 0.0005543237250554324, "grad_norm": 0.2826499938964844, "kl": 0.0, "learning_rate": 5.0000000000000004e-08, "loss": 0.0, "reward": 3.375, "reward_std": 1.973786473274231, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.75, "step": 2 }, { "completion_length": 471.0, "epoch": 0.0008314855875831486, "grad_norm": 0.33350881934165955, "kl": 1.128939948102925e-05, "learning_rate": 1.0000000000000001e-07, "loss": 0.0, "reward": 3.59375, "reward_std": 1.9721366167068481, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 3 }, { "completion_length": 596.25, "epoch": 0.0011086474501108647, "grad_norm": 0.2136853039264679, "kl": 1.0410139111627359e-05, "learning_rate": 1.5000000000000002e-07, "loss": 0.0, "reward": 1.5625, "reward_std": 2.1127095222473145, "rewards/confident_score_func": 0.125, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.25, "rewards/soft_format_reward_func": 0.125, "rewards/xmlcount_reward_func": 0.5625, "step": 4 }, { "completion_length": 508.25, "epoch": 0.001385809312638581, "grad_norm": 0.2845478057861328, "kl": 8.62709384819027e-06, "learning_rate": 2.0000000000000002e-07, "loss": -0.0, "reward": 4.09375, "reward_std": 1.5724681615829468, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 5 }, { "completion_length": 433.75, "epoch": 0.0016629711751662971, "grad_norm": 0.3396788537502289, "kl": 1.2766749023285229e-05, "learning_rate": 2.5000000000000004e-07, "loss": -0.0, "reward": 2.4375, "reward_std": 1.0680004358291626, "rewards/confident_score_func": 0.375, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.6875, "step": 6 }, { "completion_length": 495.25, "epoch": 0.0019401330376940134, "grad_norm": 0.25669044256210327, "kl": 8.092667485470884e-06, "learning_rate": 3.0000000000000004e-07, "loss": 0.0, "reward": 3.625, "reward_std": 1.314977765083313, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.75, "step": 7 }, { "completion_length": 519.25, "epoch": 0.0022172949002217295, "grad_norm": 0.25402092933654785, "kl": 9.882345693768002e-06, "learning_rate": 3.5000000000000004e-07, "loss": -0.0, "reward": 3.46875, "reward_std": 1.8325045108795166, "rewards/confident_score_func": 0.875, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 8 }, { "completion_length": 600.75, "epoch": 0.0024944567627494456, "grad_norm": 0.2685031592845917, "kl": 1.0373846635047812e-05, "learning_rate": 4.0000000000000003e-07, "loss": -0.0, "reward": 4.1875, "reward_std": 1.8385342359542847, "rewards/confident_score_func": 1.375, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.25, "rewards/xmlcount_reward_func": 0.5625, "step": 9 }, { "completion_length": 468.25, "epoch": 0.002771618625277162, "grad_norm": 0.24542361497879028, "kl": 1.0986384950228967e-05, "learning_rate": 4.5000000000000003e-07, "loss": 0.0, "reward": 4.0, "reward_std": 1.5, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10 }, { "completion_length": 525.25, "epoch": 0.003048780487804878, "grad_norm": 0.0, "kl": 8.427283319178969e-06, "learning_rate": 5.000000000000001e-07, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11 }, { "completion_length": 439.75, "epoch": 0.0033259423503325942, "grad_norm": 0.2493726760149002, "kl": 9.617193427402526e-06, "learning_rate": 5.5e-07, "loss": 0.0, "reward": 4.625, "reward_std": 1.6520190238952637, "rewards/confident_score_func": 1.375, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12 }, { "completion_length": 587.25, "epoch": 0.0036031042128603103, "grad_norm": 0.2199113667011261, "kl": 9.088103979593143e-06, "learning_rate": 6.000000000000001e-07, "loss": 0.0, "reward": 3.84375, "reward_std": 1.8125, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 13 }, { "completion_length": 539.5, "epoch": 0.003880266075388027, "grad_norm": 0.281586617231369, "kl": 1.0656050108082127e-05, "learning_rate": 6.5e-07, "loss": 0.0, "reward": 3.0625, "reward_std": 1.7455300092697144, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5625, "step": 14 }, { "completion_length": 584.5, "epoch": 0.0041574279379157425, "grad_norm": 0.2644045054912567, "kl": 1.0303749149898067e-05, "learning_rate": 7.000000000000001e-07, "loss": -0.0, "reward": 3.53125, "reward_std": 1.6341887712478638, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.25, "rewards/xmlcount_reward_func": 0.65625, "step": 15 }, { "completion_length": 500.25, "epoch": 0.004434589800443459, "grad_norm": 0.278541624546051, "kl": 1.111542951548472e-05, "learning_rate": 7.5e-07, "loss": -0.0, "reward": 2.90625, "reward_std": 1.907701015472412, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.65625, "step": 16 }, { "completion_length": 516.25, "epoch": 0.0047117516629711755, "grad_norm": 0.25131115317344666, "kl": 1.036299090628745e-05, "learning_rate": 8.000000000000001e-07, "loss": -0.0, "reward": 3.5, "reward_std": 2.0615527629852295, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 17 }, { "completion_length": 572.75, "epoch": 0.004988913525498891, "grad_norm": 0.2191542536020279, "kl": 9.720729394757655e-06, "learning_rate": 8.500000000000001e-07, "loss": -0.0, "reward": 2.875, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 18 }, { "completion_length": 562.75, "epoch": 0.005266075388026608, "grad_norm": 0.18628911674022675, "kl": 8.783076737017836e-06, "learning_rate": 9.000000000000001e-07, "loss": -0.0, "reward": 1.71875, "reward_std": 0.46069467067718506, "rewards/confident_score_func": 0.125, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 19 }, { "completion_length": 589.75, "epoch": 0.005543237250554324, "grad_norm": 0.22150404751300812, "kl": 8.758663170738146e-06, "learning_rate": 9.500000000000001e-07, "loss": 0.0, "reward": 5.15625, "reward_std": 0.702488124370575, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.65625, "step": 20 }, { "completion_length": 527.25, "epoch": 0.00582039911308204, "grad_norm": 0.2421361654996872, "kl": 9.786639566300437e-06, "learning_rate": 1.0000000000000002e-06, "loss": -0.0, "reward": 2.3125, "reward_std": 1.125, "rewards/confident_score_func": 0.25, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.6875, "step": 21 }, { "completion_length": 580.0, "epoch": 0.006097560975609756, "grad_norm": 0.22485709190368652, "kl": 1.0484103768249042e-05, "learning_rate": 1.0500000000000001e-06, "loss": -0.0, "reward": 2.375, "reward_std": 0.9464846849441528, "rewards/confident_score_func": 0.125, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 22 }, { "completion_length": 544.0, "epoch": 0.006374722838137472, "grad_norm": 0.24875161051750183, "kl": 9.304120794695336e-06, "learning_rate": 1.1e-06, "loss": 0.0, "reward": 3.96875, "reward_std": 1.1831059455871582, "rewards/confident_score_func": 0.875, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 23 }, { "completion_length": 552.5, "epoch": 0.0066518847006651885, "grad_norm": 0.2850598990917206, "kl": 1.168019025499234e-05, "learning_rate": 1.1500000000000002e-06, "loss": -0.0, "reward": 4.34375, "reward_std": 1.483854055404663, "rewards/confident_score_func": 1.125, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.71875, "step": 24 }, { "completion_length": 548.5, "epoch": 0.006929046563192905, "grad_norm": 0.23308464884757996, "kl": 9.858715202426538e-06, "learning_rate": 1.2000000000000002e-06, "loss": 0.0, "reward": 2.71875, "reward_std": 1.6213902235031128, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 25 }, { "completion_length": 521.5, "epoch": 0.007206208425720621, "grad_norm": 0.35765984654426575, "kl": 9.292946742789354e-06, "learning_rate": 1.25e-06, "loss": -0.0, "reward": 4.25, "reward_std": 1.7320507764816284, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 26 }, { "completion_length": 457.0, "epoch": 0.007483370288248337, "grad_norm": 0.2953888475894928, "kl": 8.89795228431467e-06, "learning_rate": 1.3e-06, "loss": 0.0, "reward": 3.71875, "reward_std": 2.3460404872894287, "rewards/confident_score_func": 1.125, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 27 }, { "completion_length": 444.75, "epoch": 0.007760532150776054, "grad_norm": 0.3193739354610443, "kl": 1.0488542102393694e-05, "learning_rate": 1.3500000000000002e-06, "loss": 0.0, "reward": 2.625, "reward_std": 1.4361406564712524, "rewards/confident_score_func": 0.375, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 28 }, { "completion_length": 582.0, "epoch": 0.00803769401330377, "grad_norm": 0.2760341763496399, "kl": 9.948931619874202e-06, "learning_rate": 1.4000000000000001e-06, "loss": 0.0, "reward": 1.0625, "reward_std": 2.4590394496917725, "rewards/confident_score_func": -0.25, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.25, "rewards/soft_format_reward_func": 0.125, "rewards/xmlcount_reward_func": 0.4375, "step": 29 }, { "completion_length": 488.25, "epoch": 0.008314855875831485, "grad_norm": 0.28203949332237244, "kl": 9.830992894421797e-06, "learning_rate": 1.45e-06, "loss": 0.0, "reward": 2.8125, "reward_std": 1.983000636100769, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.6875, "step": 30 }, { "completion_length": 559.25, "epoch": 0.008592017738359202, "grad_norm": 0.21860253810882568, "kl": 9.808512913878076e-06, "learning_rate": 1.5e-06, "loss": 0.0, "reward": 3.6875, "reward_std": 1.983000636100769, "rewards/confident_score_func": 1.125, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.6875, "step": 31 }, { "completion_length": 517.0, "epoch": 0.008869179600886918, "grad_norm": 0.22798267006874084, "kl": 8.656654244987294e-06, "learning_rate": 1.5500000000000002e-06, "loss": 0.0, "reward": 2.71875, "reward_std": 2.670001268386841, "rewards/confident_score_func": 0.375, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.59375, "step": 32 }, { "completion_length": 512.75, "epoch": 0.009146341463414634, "grad_norm": 0.2716781198978424, "kl": 1.1083876415796112e-05, "learning_rate": 1.6000000000000001e-06, "loss": 0.0, "reward": 3.5, "reward_std": 1.6863422393798828, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.625, "step": 33 }, { "completion_length": 596.5, "epoch": 0.009423503325942351, "grad_norm": 0.22542887926101685, "kl": 1.0993462637998164e-05, "learning_rate": 1.6500000000000003e-06, "loss": 0.0, "reward": 2.46875, "reward_std": 2.5132460594177246, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 34 }, { "completion_length": 557.5, "epoch": 0.009700665188470067, "grad_norm": 0.30234187841415405, "kl": 9.6288758868468e-06, "learning_rate": 1.7000000000000002e-06, "loss": -0.0, "reward": 3.0625, "reward_std": 1.625, "rewards/confident_score_func": 0.875, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.6875, "step": 35 }, { "completion_length": 620.0, "epoch": 0.009977827050997782, "grad_norm": 0.23311540484428406, "kl": 9.361262527818326e-06, "learning_rate": 1.75e-06, "loss": -0.0, "reward": 3.375, "reward_std": 1.6007810831069946, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 36 }, { "completion_length": 461.75, "epoch": 0.0102549889135255, "grad_norm": 0.2609017491340637, "kl": 1.1284730135230348e-05, "learning_rate": 1.8000000000000001e-06, "loss": -0.0, "reward": 2.59375, "reward_std": 1.4626424312591553, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 37 }, { "completion_length": 596.25, "epoch": 0.010532150776053215, "grad_norm": 0.216621533036232, "kl": 1.2903798051411286e-05, "learning_rate": 1.85e-06, "loss": -0.0, "reward": 4.09375, "reward_std": 1.6998008489608765, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 38 }, { "completion_length": 533.25, "epoch": 0.010809312638580931, "grad_norm": 0.2331874668598175, "kl": 1.4122239917924162e-05, "learning_rate": 1.9000000000000002e-06, "loss": -0.0, "reward": 1.84375, "reward_std": 2.2989468574523926, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.59375, "step": 39 }, { "completion_length": 531.25, "epoch": 0.011086474501108648, "grad_norm": 0.30514734983444214, "kl": 1.0896302228502464e-05, "learning_rate": 1.9500000000000004e-06, "loss": 0.0, "reward": 2.9375, "reward_std": 3.089801788330078, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.25, "rewards/xmlcount_reward_func": 0.5625, "step": 40 }, { "completion_length": 605.75, "epoch": 0.011363636363636364, "grad_norm": 0.20663057267665863, "kl": 1.2109178896935191e-05, "learning_rate": 2.0000000000000003e-06, "loss": -0.0, "reward": 1.90625, "reward_std": 2.2017393112182617, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.65625, "step": 41 }, { "completion_length": 627.75, "epoch": 0.01164079822616408, "grad_norm": 0.2208492010831833, "kl": 1.239549874298973e-05, "learning_rate": 2.05e-06, "loss": -0.0, "reward": 1.34375, "reward_std": 0.8125, "rewards/confident_score_func": -0.25, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 42 }, { "completion_length": 541.0, "epoch": 0.011917960088691795, "grad_norm": 0.27659934759140015, "kl": 1.2113577213312965e-05, "learning_rate": 2.1000000000000002e-06, "loss": -0.0, "reward": 3.375, "reward_std": 1.973786473274231, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.75, "step": 43 }, { "completion_length": 416.75, "epoch": 0.012195121951219513, "grad_norm": 0.3077487647533417, "kl": 1.4389873285836075e-05, "learning_rate": 2.15e-06, "loss": 0.0, "reward": 4.0, "reward_std": 2.020725965499878, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 44 }, { "completion_length": 515.5, "epoch": 0.012472283813747228, "grad_norm": 0.3328346610069275, "kl": 1.4842245036561508e-05, "learning_rate": 2.2e-06, "loss": 0.0, "reward": 4.625, "reward_std": 1.6520190238952637, "rewards/confident_score_func": 1.375, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 45 }, { "completion_length": 534.5, "epoch": 0.012749445676274944, "grad_norm": 0.25916367769241333, "kl": 1.5466106560779735e-05, "learning_rate": 2.25e-06, "loss": -0.0, "reward": 2.9375, "reward_std": 1.8860783576965332, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.6875, "step": 46 }, { "completion_length": 568.75, "epoch": 0.013026607538802661, "grad_norm": 0.24128328263759613, "kl": 1.1646696293610148e-05, "learning_rate": 2.3000000000000004e-06, "loss": 0.0, "reward": 5.34375, "reward_std": 0.8125, "rewards/confident_score_func": 1.75, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 47 }, { "completion_length": 490.5, "epoch": 0.013303769401330377, "grad_norm": 0.29966527223587036, "kl": 1.3090644642943516e-05, "learning_rate": 2.35e-06, "loss": 0.0, "reward": 2.4375, "reward_std": 1.1569535732269287, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.25, "rewards/xmlcount_reward_func": 0.6875, "step": 48 }, { "completion_length": 449.75, "epoch": 0.013580931263858093, "grad_norm": 0.3126159608364105, "kl": 1.4555795132764615e-05, "learning_rate": 2.4000000000000003e-06, "loss": 0.0, "reward": 3.0, "reward_std": 1.8484227657318115, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 49 }, { "completion_length": 492.75, "epoch": 0.01385809312638581, "grad_norm": 0.2463827133178711, "kl": 1.843028985604178e-05, "learning_rate": 2.4500000000000003e-06, "loss": 0.0, "reward": 2.875, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 50 }, { "completion_length": 555.25, "epoch": 0.014135254988913526, "grad_norm": 0.3274115324020386, "kl": 3.26520275848452e-05, "learning_rate": 2.5e-06, "loss": 0.0, "reward": 4.5, "reward_std": 1.8929693698883057, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 51 }, { "completion_length": 507.5, "epoch": 0.014412416851441241, "grad_norm": 0.23893342912197113, "kl": 1.802389851945918e-05, "learning_rate": 2.55e-06, "loss": 0.0, "reward": 2.875, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 52 }, { "completion_length": 483.75, "epoch": 0.014689578713968959, "grad_norm": 0.2833217978477478, "kl": 2.2152598830871284e-05, "learning_rate": 2.6e-06, "loss": -0.0, "reward": 2.53125, "reward_std": 2.9709970951080322, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.125, "rewards/xmlcount_reward_func": 0.53125, "step": 53 }, { "completion_length": 497.0, "epoch": 0.014966740576496674, "grad_norm": 0.24128924310207367, "kl": 3.548898530425504e-05, "learning_rate": 2.6500000000000005e-06, "loss": -0.0, "reward": 1.59375, "reward_std": 0.3125, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 54 }, { "completion_length": 480.25, "epoch": 0.01524390243902439, "grad_norm": 0.27789512276649475, "kl": 2.1756190108135343e-05, "learning_rate": 2.7000000000000004e-06, "loss": -0.0, "reward": 2.6875, "reward_std": 2.814582586288452, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.25, "rewards/xmlcount_reward_func": 0.5625, "step": 55 }, { "completion_length": 572.25, "epoch": 0.015521064301552107, "grad_norm": 0.30793288350105286, "kl": 2.261531335534528e-05, "learning_rate": 2.7500000000000004e-06, "loss": -0.0, "reward": 3.46875, "reward_std": 1.715052843093872, "rewards/confident_score_func": 0.875, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 56 }, { "completion_length": 543.75, "epoch": 0.01579822616407982, "grad_norm": 0.32060179114341736, "kl": 3.444459071033634e-05, "learning_rate": 2.8000000000000003e-06, "loss": -0.0, "reward": 3.71875, "reward_std": 2.2738893032073975, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.71875, "step": 57 }, { "completion_length": 977.5, "epoch": 0.01607538802660754, "grad_norm": 0.25619077682495117, "kl": 2.6989517209585756e-05, "learning_rate": 2.85e-06, "loss": -0.0, "reward": 2.09375, "reward_std": 1.8967489004135132, "rewards/confident_score_func": 0.125, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 58 }, { "completion_length": 596.5, "epoch": 0.016352549889135256, "grad_norm": 0.42059677839279175, "kl": 3.6239616747479886e-05, "learning_rate": 2.9e-06, "loss": 0.0, "reward": 3.53125, "reward_std": 3.0023863315582275, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.25, "rewards/xmlcount_reward_func": 0.40625, "step": 59 }, { "completion_length": 493.75, "epoch": 0.01662971175166297, "grad_norm": 0.24205096065998077, "kl": 3.4313568903598934e-05, "learning_rate": 2.95e-06, "loss": -0.0, "reward": 4.6875, "reward_std": 1.6630167961120605, "rewards/confident_score_func": 1.625, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.6875, "step": 60 }, { "completion_length": 436.25, "epoch": 0.016906873614190687, "grad_norm": 0.29289618134498596, "kl": 4.696770338341594e-05, "learning_rate": 3e-06, "loss": -0.0, "reward": 3.875, "reward_std": 2.1746647357940674, "rewards/confident_score_func": 1.125, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 61 }, { "completion_length": 655.25, "epoch": 0.017184035476718405, "grad_norm": 0.333635538816452, "kl": 4.950295988237485e-05, "learning_rate": 3.05e-06, "loss": 0.0, "reward": 4.90625, "reward_std": 0.5896238088607788, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.65625, "step": 62 }, { "completion_length": 548.25, "epoch": 0.01746119733924612, "grad_norm": 0.25839337706565857, "kl": 4.438423638930544e-05, "learning_rate": 3.1000000000000004e-06, "loss": -0.0, "reward": 3.5625, "reward_std": 2.534141778945923, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.25, "rewards/xmlcount_reward_func": 0.6875, "step": 63 }, { "completion_length": 527.75, "epoch": 0.017738359201773836, "grad_norm": 0.24760793149471283, "kl": 5.6190947361756116e-05, "learning_rate": 3.1500000000000003e-06, "loss": -0.0, "reward": 2.03125, "reward_std": 2.821744203567505, "rewards/confident_score_func": 0.375, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.25, "rewards/xmlcount_reward_func": 0.53125, "step": 64 }, { "completion_length": 527.25, "epoch": 0.018015521064301553, "grad_norm": 0.2859019637107849, "kl": 4.132985122851096e-05, "learning_rate": 3.2000000000000003e-06, "loss": -0.0, "reward": 3.1875, "reward_std": 1.8271676301956177, "rewards/confident_score_func": 0.875, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.25, "rewards/xmlcount_reward_func": 0.5625, "step": 65 }, { "completion_length": 491.25, "epoch": 0.018292682926829267, "grad_norm": 0.3017843961715698, "kl": 8.138444536598399e-05, "learning_rate": 3.2500000000000002e-06, "loss": -0.0, "reward": 3.59375, "reward_std": 2.5028629302978516, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 66 }, { "completion_length": 587.0, "epoch": 0.018569844789356985, "grad_norm": 0.24489502608776093, "kl": 8.795899339020252e-05, "learning_rate": 3.3000000000000006e-06, "loss": -0.0, "reward": 4.5625, "reward_std": 1.980372428894043, "rewards/confident_score_func": 1.625, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.25, "rewards/xmlcount_reward_func": 0.6875, "step": 67 }, { "completion_length": 538.5, "epoch": 0.018847006651884702, "grad_norm": 0.2539118528366089, "kl": 5.5070027883630246e-05, "learning_rate": 3.3500000000000005e-06, "loss": -0.0, "reward": 3.71875, "reward_std": 2.0113608837127686, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.59375, "step": 68 }, { "completion_length": 562.5, "epoch": 0.019124168514412416, "grad_norm": 0.2318730354309082, "kl": 6.305942224571481e-05, "learning_rate": 3.4000000000000005e-06, "loss": 0.0, "reward": 3.78125, "reward_std": 2.2738893032073975, "rewards/confident_score_func": 1.125, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.65625, "step": 69 }, { "completion_length": 513.5, "epoch": 0.019401330376940133, "grad_norm": 0.3258090615272522, "kl": 9.62337726377882e-05, "learning_rate": 3.45e-06, "loss": 0.0, "reward": 2.5, "reward_std": 1.658312439918518, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.625, "step": 70 }, { "completion_length": 642.75, "epoch": 0.01967849223946785, "grad_norm": 0.2240474671125412, "kl": 0.00019991242152173072, "learning_rate": 3.5e-06, "loss": -0.0, "reward": 5.25, "reward_std": 0.5773502588272095, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 71 }, { "completion_length": 484.5, "epoch": 0.019955654101995565, "grad_norm": 0.0, "kl": 0.0001678902772255242, "learning_rate": 3.5500000000000003e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 72 }, { "completion_length": 541.5, "epoch": 0.020232815964523282, "grad_norm": 0.27008548378944397, "kl": 0.00021189358085393906, "learning_rate": 3.6000000000000003e-06, "loss": 0.0, "reward": 3.5, "reward_std": 2.0615527629852295, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 73 }, { "completion_length": 475.75, "epoch": 0.020509977827051, "grad_norm": 0.29425758123397827, "kl": 0.0001920523209264502, "learning_rate": 3.65e-06, "loss": -0.0, "reward": 3.5, "reward_std": 1.443375587463379, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 74 }, { "completion_length": 496.0, "epoch": 0.020787139689578713, "grad_norm": 0.25347745418548584, "kl": 0.0001954682811629027, "learning_rate": 3.7e-06, "loss": -0.0, "reward": 4.375, "reward_std": 1.4930394887924194, "rewards/confident_score_func": 1.125, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 75 }, { "completion_length": 630.0, "epoch": 0.02106430155210643, "grad_norm": 0.23993411660194397, "kl": 0.0002233159902971238, "learning_rate": 3.7500000000000005e-06, "loss": -0.0, "reward": 4.21875, "reward_std": 1.473286509513855, "rewards/confident_score_func": 1.125, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 76 }, { "completion_length": 595.75, "epoch": 0.021341463414634148, "grad_norm": 0.2944791913032532, "kl": 0.0001903584779938683, "learning_rate": 3.8000000000000005e-06, "loss": -0.0, "reward": 4.4375, "reward_std": 1.8413649797439575, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.6875, "step": 77 }, { "completion_length": 565.75, "epoch": 0.021618625277161862, "grad_norm": 0.32485634088516235, "kl": 0.00011665323108900338, "learning_rate": 3.85e-06, "loss": -0.0, "reward": 4.0625, "reward_std": 1.7245168685913086, "rewards/confident_score_func": 1.125, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.5625, "step": 78 }, { "completion_length": 607.0, "epoch": 0.02189578713968958, "grad_norm": 0.1924661248922348, "kl": 0.0005084489821456373, "learning_rate": 3.900000000000001e-06, "loss": 0.0, "reward": 4.5, "reward_std": 1.8929693698883057, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 79 }, { "completion_length": 564.0, "epoch": 0.022172949002217297, "grad_norm": 0.24093343317508698, "kl": 0.00028049247339367867, "learning_rate": 3.95e-06, "loss": -0.0, "reward": 1.5, "reward_std": 0.3061862289905548, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.625, "step": 80 }, { "completion_length": 548.0, "epoch": 0.02245011086474501, "grad_norm": 0.24702483415603638, "kl": 0.00030992651591077447, "learning_rate": 4.000000000000001e-06, "loss": 0.0, "reward": 3.8125, "reward_std": 1.3863170146942139, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5625, "step": 81 }, { "completion_length": 610.0, "epoch": 0.022727272727272728, "grad_norm": 0.2588473856449127, "kl": 0.00017034553457051516, "learning_rate": 4.05e-06, "loss": -0.0, "reward": 2.46875, "reward_std": 1.5889428853988647, "rewards/confident_score_func": 0.375, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 82 }, { "completion_length": 544.75, "epoch": 0.023004434589800442, "grad_norm": 0.29058149456977844, "kl": 0.00032605425803922117, "learning_rate": 4.1e-06, "loss": 0.0, "reward": 4.0, "reward_std": 1.5, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 83 }, { "completion_length": 573.0, "epoch": 0.02328159645232816, "grad_norm": 0.2425958663225174, "kl": 0.00047077532508410513, "learning_rate": 4.15e-06, "loss": -0.0, "reward": 2.0, "reward_std": 0.28867512941360474, "rewards/confident_score_func": 0.25, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 84 }, { "completion_length": 521.75, "epoch": 0.023558758314855877, "grad_norm": 0.2589661180973053, "kl": 0.0003013776440639049, "learning_rate": 4.2000000000000004e-06, "loss": -0.0, "reward": 3.71875, "reward_std": 1.74515700340271, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.59375, "step": 85 }, { "completion_length": 500.25, "epoch": 0.02383592017738359, "grad_norm": 0.26340121030807495, "kl": 0.0006595224840566516, "learning_rate": 4.25e-06, "loss": 0.0, "reward": 3.75, "reward_std": 1.7795131206512451, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 86 }, { "completion_length": 560.75, "epoch": 0.024113082039911308, "grad_norm": 0.2269776314496994, "kl": 0.0006194392335601151, "learning_rate": 4.3e-06, "loss": 0.0, "reward": 2.75, "reward_std": 1.3540064096450806, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 87 }, { "completion_length": 575.0, "epoch": 0.024390243902439025, "grad_norm": 0.2649100422859192, "kl": 0.0007867392851039767, "learning_rate": 4.350000000000001e-06, "loss": -0.0, "reward": 4.09375, "reward_std": 1.6998008489608765, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 88 }, { "completion_length": 520.0, "epoch": 0.02466740576496674, "grad_norm": 0.31399402022361755, "kl": 0.0005973342922516167, "learning_rate": 4.4e-06, "loss": -0.0, "reward": 1.875, "reward_std": 0.25, "rewards/confident_score_func": 0.125, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 89 }, { "completion_length": 520.75, "epoch": 0.024944567627494457, "grad_norm": 0.22899767756462097, "kl": 0.0006152435671538115, "learning_rate": 4.450000000000001e-06, "loss": 0.0, "reward": 4.46875, "reward_std": 1.9535624980926514, "rewards/confident_score_func": 1.375, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 90 }, { "completion_length": 679.75, "epoch": 0.025221729490022174, "grad_norm": 0.2924948036670685, "kl": 0.0005268845125101507, "learning_rate": 4.5e-06, "loss": -0.0, "reward": 1.21875, "reward_std": 1.4155939817428589, "rewards/confident_score_func": -0.125, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.59375, "step": 91 }, { "completion_length": 499.25, "epoch": 0.025498891352549888, "grad_norm": 0.28107649087905884, "kl": 0.0008572362130507827, "learning_rate": 4.5500000000000005e-06, "loss": 0.0, "reward": 2.71875, "reward_std": 1.6719967126846313, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 92 }, { "completion_length": 566.5, "epoch": 0.025776053215077605, "grad_norm": 0.25328919291496277, "kl": 0.0009656297042965889, "learning_rate": 4.600000000000001e-06, "loss": 0.0, "reward": 3.21875, "reward_std": 1.8268113136291504, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 93 }, { "completion_length": 593.75, "epoch": 0.026053215077605323, "grad_norm": 0.3329068124294281, "kl": 0.0005953624495305121, "learning_rate": 4.65e-06, "loss": -0.0, "reward": 2.59375, "reward_std": 1.4626424312591553, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 94 }, { "completion_length": 521.0, "epoch": 0.026330376940133036, "grad_norm": 0.24568210542201996, "kl": 0.000615741650108248, "learning_rate": 4.7e-06, "loss": -0.0, "reward": 1.84375, "reward_std": 0.5340002179145813, "rewards/confident_score_func": 0.25, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 95 }, { "completion_length": 523.25, "epoch": 0.026607538802660754, "grad_norm": 0.2706901431083679, "kl": 0.0006628820556215942, "learning_rate": 4.75e-06, "loss": 0.0, "reward": 3.34375, "reward_std": 1.25571608543396, "rewards/confident_score_func": 0.25, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 96 }, { "completion_length": 560.5, "epoch": 0.02688470066518847, "grad_norm": 0.24192999303340912, "kl": 0.0010129789588972926, "learning_rate": 4.800000000000001e-06, "loss": -0.0, "reward": 3.5, "reward_std": 2.0615527629852295, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 97 }, { "completion_length": 579.75, "epoch": 0.027161862527716185, "grad_norm": 0.2865859866142273, "kl": 0.0008974914671853185, "learning_rate": 4.85e-06, "loss": 0.0, "reward": 4.0, "reward_std": 1.5, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 98 }, { "completion_length": 562.0, "epoch": 0.027439024390243903, "grad_norm": 0.24675221741199493, "kl": 0.0008242416079156101, "learning_rate": 4.9000000000000005e-06, "loss": -0.0, "reward": 2.5, "reward_std": 2.3273732662200928, "rewards/confident_score_func": 0.375, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 99 }, { "completion_length": 476.0, "epoch": 0.02771618625277162, "grad_norm": 0.3172301650047302, "kl": 0.0005580474389716983, "learning_rate": 4.95e-06, "loss": -0.0, "reward": 2.875, "reward_std": 1.25, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 100 }, { "completion_length": 454.5, "epoch": 0.027993348115299334, "grad_norm": 0.3262696862220764, "kl": 0.0009546653600409627, "learning_rate": 5e-06, "loss": 0.0, "reward": 3.5, "reward_std": 1.443375587463379, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 101 }, { "completion_length": 646.75, "epoch": 0.02827050997782705, "grad_norm": 0.21601717174053192, "kl": 0.0005823949468322098, "learning_rate": 4.9999999616677006e-06, "loss": 0.0, "reward": 2.4375, "reward_std": 3.8249454498291016, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.25, "rewards/soft_format_reward_func": 0.25, "rewards/xmlcount_reward_func": 0.4375, "step": 102 }, { "completion_length": 725.25, "epoch": 0.02854767184035477, "grad_norm": 0.2702260911464691, "kl": 0.0007772979442961514, "learning_rate": 4.999999846670801e-06, "loss": -0.0, "reward": 3.6875, "reward_std": 2.355180501937866, "rewards/confident_score_func": 1.125, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.6875, "step": 103 }, { "completion_length": 566.25, "epoch": 0.028824833702882482, "grad_norm": 0.2261461317539215, "kl": 0.0008430497255176306, "learning_rate": 4.999999655009307e-06, "loss": -0.0, "reward": 1.5625, "reward_std": 0.2975594997406006, "rewards/confident_score_func": 0.25, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.25, "rewards/xmlcount_reward_func": 0.6875, "step": 104 }, { "completion_length": 449.5, "epoch": 0.0291019955654102, "grad_norm": 0.2898308038711548, "kl": 0.0013253889046609402, "learning_rate": 4.9999993866832215e-06, "loss": 0.0, "reward": 4.625, "reward_std": 1.6520190238952637, "rewards/confident_score_func": 1.375, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 105 }, { "completion_length": 655.75, "epoch": 0.029379157427937917, "grad_norm": 0.2288985699415207, "kl": 0.0013301436556503177, "learning_rate": 4.999999041692556e-06, "loss": 0.0, "reward": 5.25, "reward_std": 0.5773502588272095, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 106 }, { "completion_length": 582.25, "epoch": 0.02965631929046563, "grad_norm": 0.24303385615348816, "kl": 0.0010237114038318396, "learning_rate": 4.999998620037319e-06, "loss": -0.0, "reward": 3.5, "reward_std": 2.0615527629852295, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 107 }, { "completion_length": 707.5, "epoch": 0.02993348115299335, "grad_norm": 0.2842788100242615, "kl": 0.0006655353354290128, "learning_rate": 4.999998121717524e-06, "loss": -0.0, "reward": 2.84375, "reward_std": 3.420792818069458, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.59375, "step": 108 }, { "completion_length": 508.0, "epoch": 0.030210643015521066, "grad_norm": 0.3998611569404602, "kl": 0.0024813413619995117, "learning_rate": 4.999997546733187e-06, "loss": -0.0, "reward": 4.09375, "reward_std": 1.280360460281372, "rewards/confident_score_func": 1.125, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.25, "rewards/xmlcount_reward_func": 0.71875, "step": 109 }, { "completion_length": 563.5, "epoch": 0.03048780487804878, "grad_norm": 0.24261142313480377, "kl": 0.0015548549126833677, "learning_rate": 4.9999968950843245e-06, "loss": 0.0, "reward": 4.34375, "reward_std": 2.197003126144409, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 110 }, { "completion_length": 557.75, "epoch": 0.030764966740576497, "grad_norm": 0.2447528839111328, "kl": 0.0012562781339511275, "learning_rate": 4.999996166770957e-06, "loss": 0.0, "reward": 3.625, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.875, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 111 }, { "completion_length": 535.75, "epoch": 0.031042128603104215, "grad_norm": 0.2839403748512268, "kl": 0.0015167887322604656, "learning_rate": 4.9999953617931074e-06, "loss": -0.0, "reward": 3.625, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.875, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 112 }, { "completion_length": 544.75, "epoch": 0.03131929046563193, "grad_norm": 0.2637383043766022, "kl": 0.0013641542755067348, "learning_rate": 4.9999944801508e-06, "loss": 0.0, "reward": 3.0, "reward_std": 1.8484227657318115, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 113 }, { "completion_length": 540.0, "epoch": 0.03159645232815964, "grad_norm": 0.24956463277339935, "kl": 0.0013399209128692746, "learning_rate": 4.99999352184406e-06, "loss": -0.0, "reward": 3.875, "reward_std": 1.4361406564712524, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.75, "step": 114 }, { "completion_length": 531.0, "epoch": 0.03187361419068736, "grad_norm": 0.3293284475803375, "kl": 0.0010093465680256486, "learning_rate": 4.999992486872919e-06, "loss": -0.0, "reward": 4.3125, "reward_std": 2.0039024353027344, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5625, "step": 115 }, { "completion_length": 466.75, "epoch": 0.03215077605321508, "grad_norm": 0.3033599555492401, "kl": 0.0018283403478562832, "learning_rate": 4.999991375237408e-06, "loss": 0.0, "reward": 2.9375, "reward_std": 3.009533405303955, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.5625, "step": 116 }, { "completion_length": 447.75, "epoch": 0.03242793791574279, "grad_norm": 0.46602895855903625, "kl": 0.0013328734785318375, "learning_rate": 4.999990186937562e-06, "loss": -0.0, "reward": 4.25, "reward_std": 1.7320507764816284, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 117 }, { "completion_length": 509.25, "epoch": 0.03270509977827051, "grad_norm": 0.29622921347618103, "kl": 0.0012848273618146777, "learning_rate": 4.9999889219734165e-06, "loss": 0.0, "reward": 3.625, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.875, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 118 }, { "completion_length": 448.25, "epoch": 0.032982261640798226, "grad_norm": 0.3296196162700653, "kl": 0.0022653101477771997, "learning_rate": 4.999987580345011e-06, "loss": 0.0, "reward": 3.21875, "reward_std": 1.444728136062622, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 119 }, { "completion_length": 889.5, "epoch": 0.03325942350332594, "grad_norm": 0.22369661927223206, "kl": 0.0016949750715866685, "learning_rate": 4.999986162052384e-06, "loss": 0.0, "reward": 2.21875, "reward_std": 2.7240729331970215, "rewards/confident_score_func": 0.375, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.59375, "step": 120 }, { "completion_length": 571.25, "epoch": 0.03353658536585366, "grad_norm": 0.2820369303226471, "kl": 0.0020861346274614334, "learning_rate": 4.999984667095583e-06, "loss": 0.0, "reward": 2.875, "reward_std": 1.25, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 121 }, { "completion_length": 461.25, "epoch": 0.033813747228381374, "grad_norm": 0.30351871252059937, "kl": 0.0023725288920104504, "learning_rate": 4.999983095474651e-06, "loss": -0.0, "reward": 3.4375, "reward_std": 2.192553758621216, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.6875, "step": 122 }, { "completion_length": 605.75, "epoch": 0.03409090909090909, "grad_norm": 0.30575117468833923, "kl": 0.00228232447989285, "learning_rate": 4.999981447189638e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 123 }, { "completion_length": 491.25, "epoch": 0.03436807095343681, "grad_norm": 0.24359185993671417, "kl": 0.0014123540604487062, "learning_rate": 4.999979722240593e-06, "loss": 0.0, "reward": 1.71875, "reward_std": 0.709863543510437, "rewards/confident_score_func": 0.125, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.71875, "step": 124 }, { "completion_length": 531.25, "epoch": 0.03464523281596452, "grad_norm": 0.24107161164283752, "kl": 0.0018883803859353065, "learning_rate": 4.999977920627569e-06, "loss": -0.0, "reward": 4.0, "reward_std": 2.020725965499878, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 125 }, { "completion_length": 498.75, "epoch": 0.03492239467849224, "grad_norm": 0.2970273196697235, "kl": 0.0022087255492806435, "learning_rate": 4.999976042350623e-06, "loss": -0.0, "reward": 2.6875, "reward_std": 1.3900688886642456, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.6875, "step": 126 }, { "completion_length": 486.0, "epoch": 0.03519955654101996, "grad_norm": 0.2743111550807953, "kl": 0.0012705744011327624, "learning_rate": 4.999974087409812e-06, "loss": -0.0, "reward": 3.5, "reward_std": 1.443375587463379, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 127 }, { "completion_length": 592.5, "epoch": 0.03547671840354767, "grad_norm": 0.24681898951530457, "kl": 0.0016977920895442367, "learning_rate": 4.999972055805193e-06, "loss": 0.0, "reward": 4.25, "reward_std": 1.7320507764816284, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 128 }, { "completion_length": 506.5, "epoch": 0.035753880266075386, "grad_norm": 0.3356212377548218, "kl": 0.0030539000872522593, "learning_rate": 4.9999699475368325e-06, "loss": -0.0, "reward": 2.625, "reward_std": 1.4361406564712524, "rewards/confident_score_func": 0.375, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 129 }, { "completion_length": 621.5, "epoch": 0.03603104212860311, "grad_norm": 0.25449541211128235, "kl": 0.0020835150498896837, "learning_rate": 4.999967762604793e-06, "loss": 0.0, "reward": 4.46875, "reward_std": 1.6657549142837524, "rewards/confident_score_func": 1.375, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 130 }, { "completion_length": 529.25, "epoch": 0.03630820399113082, "grad_norm": 0.2534700036048889, "kl": 0.002556864870712161, "learning_rate": 4.999965501009142e-06, "loss": -0.0, "reward": 3.875, "reward_std": 2.1746647357940674, "rewards/confident_score_func": 1.125, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 131 }, { "completion_length": 520.25, "epoch": 0.036585365853658534, "grad_norm": 0.27563413977622986, "kl": 0.002087092027068138, "learning_rate": 4.999963162749948e-06, "loss": -0.0, "reward": 5.53125, "reward_std": 0.2576940953731537, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.65625, "step": 132 }, { "completion_length": 526.5, "epoch": 0.036862527716186255, "grad_norm": 0.22601450979709625, "kl": 0.0015554060228168964, "learning_rate": 4.999960747827284e-06, "loss": -0.0, "reward": 2.625, "reward_std": 1.4361406564712524, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 133 }, { "completion_length": 441.75, "epoch": 0.03713968957871397, "grad_norm": 0.2940211892127991, "kl": 0.0020694448612630367, "learning_rate": 4.999958256241223e-06, "loss": 0.0, "reward": 4.5, "reward_std": 1.5545631647109985, "rewards/confident_score_func": 1.375, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.625, "step": 134 }, { "completion_length": 518.75, "epoch": 0.03741685144124168, "grad_norm": 0.28863954544067383, "kl": 0.0021078474819660187, "learning_rate": 4.999955687991842e-06, "loss": -0.0, "reward": 4.75, "reward_std": 1.6832507848739624, "rewards/confident_score_func": 1.625, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.625, "step": 135 }, { "completion_length": 522.0, "epoch": 0.037694013303769404, "grad_norm": 0.3035510182380676, "kl": 0.0023396280594170094, "learning_rate": 4.99995304307922e-06, "loss": 0.0, "reward": 3.25, "reward_std": 2.3804759979248047, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 136 }, { "completion_length": 556.25, "epoch": 0.03797117516629712, "grad_norm": 0.26021718978881836, "kl": 0.003481579013168812, "learning_rate": 4.9999503215034375e-06, "loss": -0.0, "reward": 3.96875, "reward_std": 2.10746431350708, "rewards/confident_score_func": 1.375, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 137 }, { "completion_length": 519.25, "epoch": 0.03824833702882483, "grad_norm": 0.25590893626213074, "kl": 0.0046329558826982975, "learning_rate": 4.999947523264577e-06, "loss": -0.0, "reward": 4.625, "reward_std": 1.6520190238952637, "rewards/confident_score_func": 1.375, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 138 }, { "completion_length": 519.5, "epoch": 0.03852549889135255, "grad_norm": 0.3054749369621277, "kl": 0.0029425916727632284, "learning_rate": 4.999944648362727e-06, "loss": 0.0, "reward": 4.21875, "reward_std": 1.7922722101211548, "rewards/confident_score_func": 1.125, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 139 }, { "completion_length": 543.75, "epoch": 0.038802660753880266, "grad_norm": 0.2765722870826721, "kl": 0.0019459128379821777, "learning_rate": 4.999941696797974e-06, "loss": -0.0, "reward": 3.875, "reward_std": 2.1746647357940674, "rewards/confident_score_func": 1.125, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 140 }, { "completion_length": 592.5, "epoch": 0.03907982261640798, "grad_norm": 0.22871868312358856, "kl": 0.002332570729777217, "learning_rate": 4.999938668570408e-06, "loss": 0.0, "reward": 4.0, "reward_std": 1.5, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 141 }, { "completion_length": 507.75, "epoch": 0.0393569844789357, "grad_norm": 0.3106369972229004, "kl": 0.0017694552661851048, "learning_rate": 4.999935563680123e-06, "loss": -0.0, "reward": 3.0625, "reward_std": 1.7955384254455566, "rewards/confident_score_func": 0.875, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.6875, "step": 142 }, { "completion_length": 515.5, "epoch": 0.039634146341463415, "grad_norm": 0.3508833646774292, "kl": 0.0030222535133361816, "learning_rate": 4.9999323821272135e-06, "loss": -0.0, "reward": 2.875, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 143 }, { "completion_length": 649.75, "epoch": 0.03991130820399113, "grad_norm": 0.3115159869194031, "kl": 0.0046004499308764935, "learning_rate": 4.999929123911778e-06, "loss": 0.0, "reward": 4.5, "reward_std": 1.8929693698883057, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 144 }, { "completion_length": 524.5, "epoch": 0.04018847006651885, "grad_norm": 0.25021612644195557, "kl": 0.0030522397719323635, "learning_rate": 4.999925789033915e-06, "loss": -0.0, "reward": 4.5625, "reward_std": 1.6504418849945068, "rewards/confident_score_func": 1.375, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.6875, "step": 145 }, { "completion_length": 422.0, "epoch": 0.040465631929046564, "grad_norm": 0.2983545958995819, "kl": 0.002578801242634654, "learning_rate": 4.999922377493727e-06, "loss": -0.0, "reward": 2.0, "reward_std": 0.28867512941360474, "rewards/confident_score_func": 0.25, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 146 }, { "completion_length": 531.25, "epoch": 0.04074279379157428, "grad_norm": 0.37473031878471375, "kl": 0.005895189009606838, "learning_rate": 4.9999188892913205e-06, "loss": -0.0, "reward": 3.875, "reward_std": 2.1746647357940674, "rewards/confident_score_func": 1.125, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 147 }, { "completion_length": 428.75, "epoch": 0.041019955654102, "grad_norm": 0.33092638850212097, "kl": 0.003967867232859135, "learning_rate": 4.9999153244268e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 148 }, { "completion_length": 467.5, "epoch": 0.04129711751662971, "grad_norm": 0.3725762367248535, "kl": 0.003083718242123723, "learning_rate": 4.999911682900276e-06, "loss": 0.0, "reward": 5.5, "reward_std": 0.5, "rewards/confident_score_func": 1.75, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 149 }, { "completion_length": 541.75, "epoch": 0.041574279379157426, "grad_norm": 0.2801760137081146, "kl": 0.004008075222373009, "learning_rate": 4.999907964711861e-06, "loss": -0.0, "reward": 3.9375, "reward_std": 1.883314847946167, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.25, "rewards/xmlcount_reward_func": 0.6875, "step": 150 }, { "completion_length": 516.75, "epoch": 0.04185144124168515, "grad_norm": 0.2908424437046051, "kl": 0.00806091446429491, "learning_rate": 4.999904169861667e-06, "loss": 0.0, "reward": 2.5, "reward_std": 1.5, "rewards/confident_score_func": 0.25, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 151 }, { "completion_length": 511.25, "epoch": 0.04212860310421286, "grad_norm": 0.30323347449302673, "kl": 0.003223398234695196, "learning_rate": 4.999900298349811e-06, "loss": -0.0, "reward": 2.875, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 152 }, { "completion_length": 561.5, "epoch": 0.042405764966740575, "grad_norm": 0.3223307430744171, "kl": 0.002532375045120716, "learning_rate": 4.999896350176413e-06, "loss": -0.0, "reward": 2.75, "reward_std": 1.3540064096450806, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 153 }, { "completion_length": 485.0, "epoch": 0.042682926829268296, "grad_norm": 0.0, "kl": 0.003968062344938517, "learning_rate": 4.999892325341593e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 154 }, { "completion_length": 450.5, "epoch": 0.04296008869179601, "grad_norm": 0.4350719451904297, "kl": 0.005024564452469349, "learning_rate": 4.999888223845476e-06, "loss": 0.0, "reward": 2.5, "reward_std": 1.5, "rewards/confident_score_func": 0.25, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 155 }, { "completion_length": 587.75, "epoch": 0.043237250554323724, "grad_norm": 0.0, "kl": 0.0033939527347683907, "learning_rate": 4.9998840456881844e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 156 }, { "completion_length": 499.5, "epoch": 0.043514412416851445, "grad_norm": 0.26506081223487854, "kl": 0.01056655216962099, "learning_rate": 4.999879790869849e-06, "loss": 0.0, "reward": 3.625, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.875, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 157 }, { "completion_length": 440.75, "epoch": 0.04379157427937916, "grad_norm": 0.3518669605255127, "kl": 0.0033067353069782257, "learning_rate": 4.999875459390601e-06, "loss": 0.0, "reward": 2.75, "reward_std": 1.3540064096450806, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 158 }, { "completion_length": 554.25, "epoch": 0.04406873614190687, "grad_norm": 0.24702425301074982, "kl": 0.009299634955823421, "learning_rate": 4.9998710512505704e-06, "loss": 0.0, "reward": 1.625, "reward_std": 0.25, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 159 }, { "completion_length": 562.25, "epoch": 0.04434589800443459, "grad_norm": 0.25372618436813354, "kl": 0.002592406701296568, "learning_rate": 4.999866566449895e-06, "loss": -0.0, "reward": 3.53125, "reward_std": 1.8662992715835571, "rewards/confident_score_func": 0.875, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.65625, "step": 160 }, { "completion_length": 494.0, "epoch": 0.04462305986696231, "grad_norm": 0.3150876760482788, "kl": 0.003243791637942195, "learning_rate": 4.999862004988709e-06, "loss": 0.0, "reward": 4.25, "reward_std": 1.7320507764816284, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 161 }, { "completion_length": 566.25, "epoch": 0.04490022172949002, "grad_norm": 0.23036235570907593, "kl": 0.003913012333214283, "learning_rate": 4.999857366867157e-06, "loss": 0.0, "reward": 2.875, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 162 }, { "completion_length": 639.0, "epoch": 0.045177383592017735, "grad_norm": 0.24176982045173645, "kl": 0.0025857016444206238, "learning_rate": 4.9998526520853775e-06, "loss": 0.0, "reward": 3.5, "reward_std": 1.443375587463379, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 163 }, { "completion_length": 549.0, "epoch": 0.045454545454545456, "grad_norm": 0.2966746389865875, "kl": 0.002923133783042431, "learning_rate": 4.999847860643517e-06, "loss": -0.0, "reward": 3.5, "reward_std": 2.0615527629852295, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 164 }, { "completion_length": 500.0, "epoch": 0.04573170731707317, "grad_norm": 0.27364498376846313, "kl": 0.004719297401607037, "learning_rate": 4.99984299254172e-06, "loss": -0.0, "reward": 2.875, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 165 }, { "completion_length": 461.5, "epoch": 0.046008869179600884, "grad_norm": 0.35000544786453247, "kl": 0.002936003264039755, "learning_rate": 4.999838047780139e-06, "loss": 0.0, "reward": 2.625, "reward_std": 2.1746647357940674, "rewards/confident_score_func": 0.375, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 166 }, { "completion_length": 447.75, "epoch": 0.046286031042128604, "grad_norm": 0.4923640191555023, "kl": 0.0036999729927629232, "learning_rate": 4.999833026358924e-06, "loss": 0.0, "reward": 3.625, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.875, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 167 }, { "completion_length": 456.75, "epoch": 0.04656319290465632, "grad_norm": 0.2979881167411804, "kl": 0.0037487128283828497, "learning_rate": 4.999827928278229e-06, "loss": -0.0, "reward": 3.5, "reward_std": 2.0615527629852295, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 168 }, { "completion_length": 458.5, "epoch": 0.04684035476718403, "grad_norm": 0.32495346665382385, "kl": 0.003023212542757392, "learning_rate": 4.9998227535382105e-06, "loss": 0.0, "reward": 2.0, "reward_std": 0.28867512941360474, "rewards/confident_score_func": 0.25, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 169 }, { "completion_length": 440.75, "epoch": 0.04711751662971175, "grad_norm": 0.2944084703922272, "kl": 0.003576485440135002, "learning_rate": 4.999817502139027e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 170 }, { "completion_length": 498.75, "epoch": 0.04739467849223947, "grad_norm": 0.2683544456958771, "kl": 0.003562978934496641, "learning_rate": 4.99981217408084e-06, "loss": -0.0, "reward": 2.0, "reward_std": 0.28867512941360474, "rewards/confident_score_func": 0.25, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 171 }, { "completion_length": 492.5, "epoch": 0.04767184035476718, "grad_norm": 0.37070927023887634, "kl": 0.0037437721621245146, "learning_rate": 4.999806769363812e-06, "loss": 0.0, "reward": 3.625, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.875, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 172 }, { "completion_length": 458.75, "epoch": 0.0479490022172949, "grad_norm": 0.33083245158195496, "kl": 0.0027694685850292444, "learning_rate": 4.99980128798811e-06, "loss": -0.0, "reward": 2.03125, "reward_std": 2.204103708267212, "rewards/confident_score_func": 0.125, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.65625, "step": 173 }, { "completion_length": 533.5, "epoch": 0.048226164079822616, "grad_norm": 0.26569101214408875, "kl": 0.0028758440166711807, "learning_rate": 4.9997957299539014e-06, "loss": -0.0, "reward": 3.78125, "reward_std": 1.74515700340271, "rewards/confident_score_func": 1.125, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.65625, "step": 174 }, { "completion_length": 512.25, "epoch": 0.04850332594235033, "grad_norm": 0.2977157235145569, "kl": 0.004387638997286558, "learning_rate": 4.9997900952613555e-06, "loss": -0.0, "reward": 4.625, "reward_std": 1.6520190238952637, "rewards/confident_score_func": 1.375, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 175 }, { "completion_length": 417.0, "epoch": 0.04878048780487805, "grad_norm": 0.3042905330657959, "kl": 0.01481810212135315, "learning_rate": 4.999784383910647e-06, "loss": -0.0, "reward": 4.875, "reward_std": 1.75, "rewards/confident_score_func": 1.625, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 176 }, { "completion_length": 563.5, "epoch": 0.049057649667405764, "grad_norm": 0.264516144990921, "kl": 0.0036362074315547943, "learning_rate": 4.999778595901951e-06, "loss": -0.0, "reward": 3.5, "reward_std": 1.443375587463379, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 177 }, { "completion_length": 473.75, "epoch": 0.04933481152993348, "grad_norm": 0.26864442229270935, "kl": 0.0030112804379314184, "learning_rate": 4.999772731235444e-06, "loss": 0.0, "reward": 2.25, "reward_std": 1.0, "rewards/confident_score_func": 0.125, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.75, "step": 178 }, { "completion_length": 454.25, "epoch": 0.0496119733924612, "grad_norm": 0.301116019487381, "kl": 0.0038946038112044334, "learning_rate": 4.9997667899113055e-06, "loss": 0.0, "reward": 4.875, "reward_std": 1.75, "rewards/confident_score_func": 1.625, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 179 }, { "completion_length": 493.5, "epoch": 0.04988913525498891, "grad_norm": 0.31358036398887634, "kl": 0.00414718734100461, "learning_rate": 4.999760771929719e-06, "loss": -0.0, "reward": 3.5, "reward_std": 2.0615527629852295, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 180 }, { "completion_length": 505.5, "epoch": 0.05016629711751663, "grad_norm": 0.2650355100631714, "kl": 0.008234556764364243, "learning_rate": 4.9997546772908675e-06, "loss": 0.0, "reward": 2.21875, "reward_std": 2.092583656311035, "rewards/confident_score_func": 0.25, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 181 }, { "completion_length": 515.0, "epoch": 0.05044345898004435, "grad_norm": 0.30155012011528015, "kl": 0.004134523682296276, "learning_rate": 4.999748505994939e-06, "loss": -0.0, "reward": 5.25, "reward_std": 0.5773502588272095, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 182 }, { "completion_length": 504.75, "epoch": 0.05072062084257206, "grad_norm": 0.3182169795036316, "kl": 0.004539824556559324, "learning_rate": 4.9997422580421225e-06, "loss": -0.0, "reward": 2.96875, "reward_std": 1.899492859840393, "rewards/confident_score_func": 0.25, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.71875, "step": 183 }, { "completion_length": 478.25, "epoch": 0.050997782705099776, "grad_norm": 0.30971816182136536, "kl": 0.0042198048904538155, "learning_rate": 4.999735933432611e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 184 }, { "completion_length": 664.5, "epoch": 0.051274944567627496, "grad_norm": 0.23832754790782928, "kl": 0.0037477167788892984, "learning_rate": 4.999729532166595e-06, "loss": -0.0, "reward": 2.625, "reward_std": 1.4361406564712524, "rewards/confident_score_func": 0.375, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 185 }, { "completion_length": 564.0, "epoch": 0.05155210643015521, "grad_norm": 0.3208759129047394, "kl": 0.0032999326940625906, "learning_rate": 4.999723054244274e-06, "loss": 0.0, "reward": 3.0, "reward_std": 1.8484227657318115, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 186 }, { "completion_length": 449.75, "epoch": 0.051829268292682924, "grad_norm": 0.268100380897522, "kl": 0.006639811210334301, "learning_rate": 4.999716499665845e-06, "loss": -0.0, "reward": 4.4375, "reward_std": 2.0142719745635986, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.6875, "step": 187 }, { "completion_length": 474.0, "epoch": 0.052106430155210645, "grad_norm": 0.3177604079246521, "kl": 0.004573184531182051, "learning_rate": 4.99970986843151e-06, "loss": 0.0, "reward": 4.625, "reward_std": 1.6520190238952637, "rewards/confident_score_func": 1.375, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 188 }, { "completion_length": 526.5, "epoch": 0.05238359201773836, "grad_norm": 0.29966405034065247, "kl": 0.003981863148510456, "learning_rate": 4.999703160541473e-06, "loss": -0.0, "reward": 4.25, "reward_std": 1.7320507764816284, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 189 }, { "completion_length": 577.0, "epoch": 0.05266075388026607, "grad_norm": 0.2751349210739136, "kl": 0.003173339180648327, "learning_rate": 4.999696375995937e-06, "loss": -0.0, "reward": 5.25, "reward_std": 0.5773502588272095, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 190 }, { "completion_length": 514.25, "epoch": 0.052937915742793794, "grad_norm": 0.30706390738487244, "kl": 0.016497010365128517, "learning_rate": 4.999689514795112e-06, "loss": -0.0, "reward": 5.5, "reward_std": 0.5, "rewards/confident_score_func": 1.75, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 191 }, { "completion_length": 443.5, "epoch": 0.05321507760532151, "grad_norm": 0.30374449491500854, "kl": 0.005075526423752308, "learning_rate": 4.999682576939208e-06, "loss": 0.0, "reward": 4.875, "reward_std": 1.75, "rewards/confident_score_func": 1.625, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 192 }, { "completion_length": 535.25, "epoch": 0.05349223946784922, "grad_norm": 0.3113245368003845, "kl": 0.003604709170758724, "learning_rate": 4.999675562428437e-06, "loss": -0.0, "reward": 4.25, "reward_std": 1.7320507764816284, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 193 }, { "completion_length": 520.25, "epoch": 0.05376940133037694, "grad_norm": 0.29945746064186096, "kl": 0.004562127869576216, "learning_rate": 4.999668471263016e-06, "loss": 0.0, "reward": 2.875, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 194 }, { "completion_length": 568.75, "epoch": 0.054046563192904656, "grad_norm": 0.25803813338279724, "kl": 0.003753086319193244, "learning_rate": 4.9996613034431605e-06, "loss": -0.0, "reward": 1.875, "reward_std": 0.25, "rewards/confident_score_func": 0.125, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 195 }, { "completion_length": 412.5, "epoch": 0.05432372505543237, "grad_norm": 0.3193735182285309, "kl": 0.01812172122299671, "learning_rate": 4.999654058969093e-06, "loss": 0.0, "reward": 4.15625, "reward_std": 1.9131535291671753, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.65625, "step": 196 }, { "completion_length": 551.25, "epoch": 0.05460088691796009, "grad_norm": 0.28885871171951294, "kl": 0.003492927411571145, "learning_rate": 4.999646737841032e-06, "loss": -0.0, "reward": 4.59375, "reward_std": 0.3125, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 197 }, { "completion_length": 502.5, "epoch": 0.054878048780487805, "grad_norm": 0.26625505089759827, "kl": 0.004594398662447929, "learning_rate": 4.999639340059204e-06, "loss": -0.0, "reward": 3.5, "reward_std": 2.0615527629852295, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 198 }, { "completion_length": 457.75, "epoch": 0.05515521064301552, "grad_norm": 0.2867913246154785, "kl": 0.0034379991702735424, "learning_rate": 4.999631865623836e-06, "loss": -0.0, "reward": 3.25, "reward_std": 1.7320507764816284, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 199 }, { "completion_length": 460.5, "epoch": 0.05543237250554324, "grad_norm": 0.5477243661880493, "kl": 0.0052807568572461605, "learning_rate": 4.9996243145351565e-06, "loss": -0.0, "reward": 2.25, "reward_std": 1.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 200 }, { "completion_length": 501.25, "epoch": 0.055709534368070954, "grad_norm": 0.2605167031288147, "kl": 0.0032645564060658216, "learning_rate": 4.999616686793398e-06, "loss": 0.0, "reward": 2.5, "reward_std": 1.5, "rewards/confident_score_func": 0.25, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 201 }, { "completion_length": 538.5, "epoch": 0.05598669623059867, "grad_norm": 0.2509101331233978, "kl": 0.004279875196516514, "learning_rate": 4.999608982398793e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 202 }, { "completion_length": 519.75, "epoch": 0.05626385809312639, "grad_norm": 0.27249786257743835, "kl": 0.004391232971101999, "learning_rate": 4.99960120135158e-06, "loss": 0.0, "reward": 4.5, "reward_std": 1.8929693698883057, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 203 }, { "completion_length": 399.5, "epoch": 0.0565410199556541, "grad_norm": 0.30695903301239014, "kl": 0.004098030738532543, "learning_rate": 4.999593343651995e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.625, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.75, "step": 204 }, { "completion_length": 518.25, "epoch": 0.056818181818181816, "grad_norm": 0.3343191146850586, "kl": 0.0038910030853003263, "learning_rate": 4.999585409300281e-06, "loss": -0.0, "reward": 2.0, "reward_std": 0.5, "rewards/confident_score_func": 0.25, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 205 }, { "completion_length": 496.75, "epoch": 0.05709534368070954, "grad_norm": 0.29982176423072815, "kl": 0.003883267054334283, "learning_rate": 4.99957739829668e-06, "loss": -0.0, "reward": 3.5, "reward_std": 2.0615527629852295, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 206 }, { "completion_length": 432.25, "epoch": 0.05737250554323725, "grad_norm": 0.3081141412258148, "kl": 0.014592984691262245, "learning_rate": 4.999569310641439e-06, "loss": -0.0, "reward": 1.09375, "reward_std": 0.7995766997337341, "rewards/confident_score_func": -0.5, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 207 }, { "completion_length": 509.5, "epoch": 0.057649667405764965, "grad_norm": 0.29632651805877686, "kl": 0.0042061153799295425, "learning_rate": 4.999561146334804e-06, "loss": -0.0, "reward": 3.875, "reward_std": 2.1746647357940674, "rewards/confident_score_func": 1.125, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 208 }, { "completion_length": 534.25, "epoch": 0.057926829268292686, "grad_norm": 0.2582140266895294, "kl": 0.003703604219481349, "learning_rate": 4.999552905377027e-06, "loss": 0.0, "reward": 4.875, "reward_std": 1.75, "rewards/confident_score_func": 1.625, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 209 }, { "completion_length": 512.0, "epoch": 0.0582039911308204, "grad_norm": 0.25513598322868347, "kl": 0.004958077799528837, "learning_rate": 4.999544587768362e-06, "loss": 0.0, "reward": 3.75, "reward_std": 1.7795131206512451, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 210 }, { "completion_length": 412.75, "epoch": 0.058481152993348114, "grad_norm": 0.3151702880859375, "kl": 0.004077285062521696, "learning_rate": 4.99953619350906e-06, "loss": 0.0, "reward": 3.875, "reward_std": 2.1746647357940674, "rewards/confident_score_func": 1.125, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 211 }, { "completion_length": 526.0, "epoch": 0.058758314855875834, "grad_norm": 0.30898264050483704, "kl": 0.0024913251399993896, "learning_rate": 4.999527722599381e-06, "loss": 0.0, "reward": 4.625, "reward_std": 1.6520190238952637, "rewards/confident_score_func": 1.375, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 212 }, { "completion_length": 402.75, "epoch": 0.05903547671840355, "grad_norm": 0.31532740592956543, "kl": 0.0042966920882463455, "learning_rate": 4.999519175039585e-06, "loss": -0.0, "reward": 4.25, "reward_std": 1.7320507764816284, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 213 }, { "completion_length": 475.75, "epoch": 0.05931263858093126, "grad_norm": 0.3384525179862976, "kl": 0.0043025235645473, "learning_rate": 4.999510550829934e-06, "loss": 0.0, "reward": 3.625, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.875, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 214 }, { "completion_length": 549.5, "epoch": 0.05958980044345898, "grad_norm": 0.29684656858444214, "kl": 0.0037625788245350122, "learning_rate": 4.999501849970692e-06, "loss": 0.0, "reward": 5.5, "reward_std": 0.5, "rewards/confident_score_func": 1.75, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 215 }, { "completion_length": 470.5, "epoch": 0.0598669623059867, "grad_norm": 0.2916940450668335, "kl": 0.008584964089095592, "learning_rate": 4.999493072462126e-06, "loss": 0.0, "reward": 4.5, "reward_std": 1.8929693698883057, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 216 }, { "completion_length": 553.75, "epoch": 0.06014412416851441, "grad_norm": 0.27805331349372864, "kl": 0.004453461151570082, "learning_rate": 4.999484218304504e-06, "loss": -0.0, "reward": 3.40625, "reward_std": 1.5591630935668945, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.65625, "step": 217 }, { "completion_length": 443.5, "epoch": 0.06042128603104213, "grad_norm": 0.3378758728504181, "kl": 0.009934209287166595, "learning_rate": 4.9994752874981e-06, "loss": 0.0, "reward": 4.875, "reward_std": 1.75, "rewards/confident_score_func": 1.625, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 218 }, { "completion_length": 565.75, "epoch": 0.060698447893569846, "grad_norm": 0.263467013835907, "kl": 0.0037423439789563417, "learning_rate": 4.999466280043186e-06, "loss": 0.0, "reward": 5.25, "reward_std": 0.5773502588272095, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 219 }, { "completion_length": 543.0, "epoch": 0.06097560975609756, "grad_norm": 0.2930412292480469, "kl": 0.017738934606313705, "learning_rate": 4.999457195940038e-06, "loss": 0.0, "reward": 3.09375, "reward_std": 2.0950710773468018, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 220 }, { "completion_length": 517.5, "epoch": 0.06125277161862528, "grad_norm": 0.32305464148521423, "kl": 0.003869486041367054, "learning_rate": 4.9994480351889364e-06, "loss": 0.0, "reward": 5.0, "reward_std": 0.5, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 221 }, { "completion_length": 472.75, "epoch": 0.061529933481152994, "grad_norm": 0.3258194029331207, "kl": 0.005644797347486019, "learning_rate": 4.999438797790161e-06, "loss": -0.0, "reward": 3.875, "reward_std": 2.1746647357940674, "rewards/confident_score_func": 1.125, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 222 }, { "completion_length": 494.5, "epoch": 0.06180709534368071, "grad_norm": 0.26710057258605957, "kl": 0.0047498042695224285, "learning_rate": 4.999429483743994e-06, "loss": 0.0, "reward": 2.78125, "reward_std": 1.4302352666854858, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.65625, "step": 223 }, { "completion_length": 496.0, "epoch": 0.06208425720620843, "grad_norm": 0.26376762986183167, "kl": 0.0037720282562077045, "learning_rate": 4.999420093050723e-06, "loss": -0.0, "reward": 5.25, "reward_std": 0.5773502588272095, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 224 }, { "completion_length": 470.5, "epoch": 0.06236141906873614, "grad_norm": 0.3175991475582123, "kl": 0.003959296271204948, "learning_rate": 4.999410625710635e-06, "loss": -0.0, "reward": 4.875, "reward_std": 1.75, "rewards/confident_score_func": 1.625, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 225 }, { "completion_length": 541.5, "epoch": 0.06263858093126386, "grad_norm": 0.24735623598098755, "kl": 0.004283316899091005, "learning_rate": 4.99940108172402e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 226 }, { "completion_length": 487.75, "epoch": 0.06291574279379157, "grad_norm": 0.25262972712516785, "kl": 0.004475950263440609, "learning_rate": 4.999391461091172e-06, "loss": -0.0, "reward": 4.375, "reward_std": 1.4930394887924194, "rewards/confident_score_func": 1.125, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 227 }, { "completion_length": 543.0, "epoch": 0.06319290465631928, "grad_norm": 0.0, "kl": 0.004400474485009909, "learning_rate": 4.999381763812384e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 228 }, { "completion_length": 539.75, "epoch": 0.06347006651884701, "grad_norm": 0.30647414922714233, "kl": 0.00762728089466691, "learning_rate": 4.999371989887955e-06, "loss": 0.0, "reward": 2.59375, "reward_std": 2.71065616607666, "rewards/confident_score_func": 0.25, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.59375, "step": 229 }, { "completion_length": 504.75, "epoch": 0.06374722838137473, "grad_norm": 0.8001681566238403, "kl": 0.0040405988693237305, "learning_rate": 4.999362139318184e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 230 }, { "completion_length": 531.75, "epoch": 0.06402439024390244, "grad_norm": 0.25891485810279846, "kl": 0.004642766900360584, "learning_rate": 4.999352212103373e-06, "loss": -0.0, "reward": 4.0, "reward_std": 1.5, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 231 }, { "completion_length": 564.5, "epoch": 0.06430155210643015, "grad_norm": 0.2624202370643616, "kl": 0.0036628940142691135, "learning_rate": 4.999342208243827e-06, "loss": 0.0, "reward": 3.75, "reward_std": 1.7795131206512451, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 232 }, { "completion_length": 519.0, "epoch": 0.06457871396895787, "grad_norm": 0.26207903027534485, "kl": 0.008784472942352295, "learning_rate": 4.9993321277398535e-06, "loss": -0.0, "reward": 2.5, "reward_std": 1.5, "rewards/confident_score_func": 0.25, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 233 }, { "completion_length": 439.75, "epoch": 0.06485587583148558, "grad_norm": 0.3140614926815033, "kl": 0.005611008033156395, "learning_rate": 4.999321970591759e-06, "loss": 0.0, "reward": 4.0625, "reward_std": 2.095381736755371, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.6875, "step": 234 }, { "completion_length": 563.25, "epoch": 0.06513303769401331, "grad_norm": 0.2644613981246948, "kl": 0.003935815300792456, "learning_rate": 4.999311736799857e-06, "loss": 0.0, "reward": 3.0, "reward_std": 1.8484227657318115, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 235 }, { "completion_length": 489.75, "epoch": 0.06541019955654102, "grad_norm": 0.27178606390953064, "kl": 0.0052208672277629375, "learning_rate": 4.999301426364461e-06, "loss": -0.0, "reward": 2.875, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 236 }, { "completion_length": 465.0, "epoch": 0.06568736141906874, "grad_norm": 0.33051714301109314, "kl": 0.004651564639061689, "learning_rate": 4.999291039285887e-06, "loss": -0.0, "reward": 4.4375, "reward_std": 2.0142719745635986, "rewards/confident_score_func": 1.375, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.6875, "step": 237 }, { "completion_length": 570.25, "epoch": 0.06596452328159645, "grad_norm": 0.22187182307243347, "kl": 0.004795686807483435, "learning_rate": 4.999280575564454e-06, "loss": -0.0, "reward": 4.25, "reward_std": 1.7320507764816284, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 238 }, { "completion_length": 575.0, "epoch": 0.06624168514412417, "grad_norm": 0.2659749388694763, "kl": 0.00839039497077465, "learning_rate": 4.999270035200483e-06, "loss": -0.0, "reward": 4.625, "reward_std": 1.6520190238952637, "rewards/confident_score_func": 1.375, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 239 }, { "completion_length": 585.0, "epoch": 0.06651884700665188, "grad_norm": 0.23789100348949432, "kl": 0.003910918720066547, "learning_rate": 4.9992594181942955e-06, "loss": -0.0, "reward": 3.5, "reward_std": 1.443375587463379, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 240 }, { "completion_length": 490.25, "epoch": 0.06679600886917961, "grad_norm": 0.2820134460926056, "kl": 0.005339812487363815, "learning_rate": 4.99924872454622e-06, "loss": -0.0, "reward": 3.875, "reward_std": 2.1746647357940674, "rewards/confident_score_func": 1.125, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 241 }, { "completion_length": 563.25, "epoch": 0.06707317073170732, "grad_norm": 0.29395750164985657, "kl": 0.0036959671415388584, "learning_rate": 4.9992379542565804e-06, "loss": -0.0, "reward": 2.875, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 242 }, { "completion_length": 666.75, "epoch": 0.06735033259423504, "grad_norm": 0.265852689743042, "kl": 0.007844896055758, "learning_rate": 4.999227107325711e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 243 }, { "completion_length": 510.5, "epoch": 0.06762749445676275, "grad_norm": 0.2735859155654907, "kl": 0.005522553808987141, "learning_rate": 4.999216183753942e-06, "loss": -0.0, "reward": 2.125, "reward_std": 0.25, "rewards/confident_score_func": 0.375, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 244 }, { "completion_length": 543.25, "epoch": 0.06790465631929046, "grad_norm": 0.2506347894668579, "kl": 0.004082122817635536, "learning_rate": 4.99920518354161e-06, "loss": 0.0, "reward": 3.0, "reward_std": 1.8484227657318115, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 245 }, { "completion_length": 535.0, "epoch": 0.06818181818181818, "grad_norm": 0.26216921210289, "kl": 0.00627948297187686, "learning_rate": 4.99919410668905e-06, "loss": -0.0, "reward": 5.5, "reward_std": 0.5, "rewards/confident_score_func": 1.75, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 246 }, { "completion_length": 412.75, "epoch": 0.0684589800443459, "grad_norm": 0.34052497148513794, "kl": 0.005706881172955036, "learning_rate": 4.999182953196604e-06, "loss": -0.0, "reward": 2.75, "reward_std": 1.4142135381698608, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 247 }, { "completion_length": 552.5, "epoch": 0.06873614190687362, "grad_norm": 0.2775396406650543, "kl": 0.00552940322086215, "learning_rate": 4.999171723064614e-06, "loss": -0.0, "reward": 3.75, "reward_std": 1.7795131206512451, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 248 }, { "completion_length": 871.75, "epoch": 0.06901330376940133, "grad_norm": 0.1770574450492859, "kl": 0.010729802772402763, "learning_rate": 4.999160416293422e-06, "loss": 0.0, "reward": 1.1875, "reward_std": 1.4012643098831177, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.25, "rewards/xmlcount_reward_func": 0.5625, "step": 249 }, { "completion_length": 455.0, "epoch": 0.06929046563192905, "grad_norm": 0.32184749841690063, "kl": 0.004980409052222967, "learning_rate": 4.999149032883377e-06, "loss": -0.0, "reward": 3.375, "reward_std": 1.6007810831069946, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 250 }, { "completion_length": 514.5, "epoch": 0.06956762749445676, "grad_norm": 0.25205788016319275, "kl": 0.011082429438829422, "learning_rate": 4.999137572834828e-06, "loss": 0.0, "reward": 2.875, "reward_std": 1.25, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 251 }, { "completion_length": 549.75, "epoch": 0.06984478935698447, "grad_norm": 0.26392266154289246, "kl": 0.005242338869720697, "learning_rate": 4.999126036148125e-06, "loss": 0.0, "reward": 4.5, "reward_std": 1.8929693698883057, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 252 }, { "completion_length": 462.5, "epoch": 0.0701219512195122, "grad_norm": 0.3302958607673645, "kl": 0.0060844034887850285, "learning_rate": 4.999114422823622e-06, "loss": 0.0, "reward": 3.0, "reward_std": 1.5, "rewards/confident_score_func": 0.25, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 253 }, { "completion_length": 537.5, "epoch": 0.07039911308203992, "grad_norm": 0.32323965430259705, "kl": 0.0115456348285079, "learning_rate": 4.999102732861677e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 254 }, { "completion_length": 490.25, "epoch": 0.07067627494456763, "grad_norm": 0.303234338760376, "kl": 0.007120540365576744, "learning_rate": 4.999090966262646e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 255 }, { "completion_length": 490.75, "epoch": 0.07095343680709534, "grad_norm": 0.3324601352214813, "kl": 0.007813826203346252, "learning_rate": 4.999079123026892e-06, "loss": 0.0, "reward": 3.625, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.875, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 256 }, { "completion_length": 465.5, "epoch": 0.07123059866962306, "grad_norm": 0.3458840250968933, "kl": 0.005496314261108637, "learning_rate": 4.999067203154777e-06, "loss": -0.0, "reward": 3.625, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.875, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 257 }, { "completion_length": 583.0, "epoch": 0.07150776053215077, "grad_norm": 0.3096715807914734, "kl": 0.00865244772285223, "learning_rate": 4.999055206646667e-06, "loss": 0.0, "reward": 2.6875, "reward_std": 2.7262229919433594, "rewards/confident_score_func": 0.375, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.5625, "step": 258 }, { "completion_length": 480.25, "epoch": 0.07178492239467849, "grad_norm": 0.2842094302177429, "kl": 0.0067632971331477165, "learning_rate": 4.999043133502929e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 259 }, { "completion_length": 545.75, "epoch": 0.07206208425720621, "grad_norm": 0.27135348320007324, "kl": 0.00945165567100048, "learning_rate": 4.999030983723934e-06, "loss": 0.0, "reward": 4.375, "reward_std": 1.4930394887924194, "rewards/confident_score_func": 1.125, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 260 }, { "completion_length": 525.25, "epoch": 0.07233924611973393, "grad_norm": 0.29009535908699036, "kl": 0.005237874574959278, "learning_rate": 4.999018757310055e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 261 }, { "completion_length": 487.75, "epoch": 0.07261640798226164, "grad_norm": 0.30857548117637634, "kl": 0.00764172850176692, "learning_rate": 4.999006454261665e-06, "loss": -0.0, "reward": 2.875, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 262 }, { "completion_length": 544.25, "epoch": 0.07289356984478935, "grad_norm": 0.2531295716762543, "kl": 0.005134346429258585, "learning_rate": 4.998994074579144e-06, "loss": 0.0, "reward": 3.75, "reward_std": 1.7795131206512451, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 263 }, { "completion_length": 480.0, "epoch": 0.07317073170731707, "grad_norm": 0.33574023842811584, "kl": 0.006391293369233608, "learning_rate": 4.99898161826287e-06, "loss": 0.0, "reward": 4.875, "reward_std": 1.75, "rewards/confident_score_func": 1.625, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 264 }, { "completion_length": 497.0, "epoch": 0.07344789356984478, "grad_norm": 0.2729950249195099, "kl": 0.015186267904937267, "learning_rate": 4.998969085313225e-06, "loss": -0.0, "reward": 5.5, "reward_std": 0.5, "rewards/confident_score_func": 1.75, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 265 }, { "completion_length": 430.0, "epoch": 0.07372505543237251, "grad_norm": 0.34028229117393494, "kl": 0.007000184152275324, "learning_rate": 4.998956475730593e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 266 }, { "completion_length": 618.75, "epoch": 0.07400221729490022, "grad_norm": 0.23451335728168488, "kl": 0.00906932633370161, "learning_rate": 4.998943789515363e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 267 }, { "completion_length": 477.75, "epoch": 0.07427937915742794, "grad_norm": 0.3186436593532562, "kl": 0.006322004366666079, "learning_rate": 4.998931026667921e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 268 }, { "completion_length": 623.0, "epoch": 0.07455654101995565, "grad_norm": 0.2667928636074066, "kl": 0.00692044897004962, "learning_rate": 4.99891818718866e-06, "loss": 0.0, "reward": 4.625, "reward_std": 1.6520190238952637, "rewards/confident_score_func": 1.375, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 269 }, { "completion_length": 483.25, "epoch": 0.07483370288248337, "grad_norm": 0.3931860029697418, "kl": 0.006671704351902008, "learning_rate": 4.9989052710779735e-06, "loss": 0.0, "reward": 4.5, "reward_std": 1.8929693698883057, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 270 }, { "completion_length": 506.25, "epoch": 0.07511086474501108, "grad_norm": 0.27710139751434326, "kl": 0.006162886042147875, "learning_rate": 4.998892278336257e-06, "loss": 0.0, "reward": 2.875, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 271 }, { "completion_length": 525.25, "epoch": 0.07538802660753881, "grad_norm": 0.268667072057724, "kl": 0.010916204191744328, "learning_rate": 4.9988792089639104e-06, "loss": -0.0, "reward": 3.0, "reward_std": 1.8484227657318115, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 272 }, { "completion_length": 571.0, "epoch": 0.07566518847006652, "grad_norm": 0.2656092643737793, "kl": 0.006535908207297325, "learning_rate": 4.998866062961333e-06, "loss": -0.0, "reward": 3.5, "reward_std": 2.0615527629852295, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 273 }, { "completion_length": 524.25, "epoch": 0.07594235033259424, "grad_norm": 0.2874826192855835, "kl": 0.007042960729449987, "learning_rate": 4.998852840328928e-06, "loss": -0.0, "reward": 4.125, "reward_std": 1.8874585628509521, "rewards/confident_score_func": 1.375, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 274 }, { "completion_length": 504.75, "epoch": 0.07621951219512195, "grad_norm": 0.26913368701934814, "kl": 0.008896050043404102, "learning_rate": 4.9988395410671024e-06, "loss": -0.0, "reward": 4.25, "reward_std": 1.7320507764816284, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 275 }, { "completion_length": 601.5, "epoch": 0.07649667405764966, "grad_norm": 0.23629991710186005, "kl": 0.006967225112020969, "learning_rate": 4.998826165176263e-06, "loss": 0.0, "reward": 4.0, "reward_std": 2.0615527629852295, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 276 }, { "completion_length": 489.75, "epoch": 0.07677383592017738, "grad_norm": 0.32739174365997314, "kl": 0.008099419996142387, "learning_rate": 4.998812712656819e-06, "loss": -0.0, "reward": 3.375, "reward_std": 1.6007810831069946, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 277 }, { "completion_length": 595.0, "epoch": 0.0770509977827051, "grad_norm": 0.27353352308273315, "kl": 0.006412680726498365, "learning_rate": 4.998799183509184e-06, "loss": -0.0, "reward": 2.875, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 278 }, { "completion_length": 546.75, "epoch": 0.07732815964523282, "grad_norm": 0.2805147171020508, "kl": 0.005846321117132902, "learning_rate": 4.998785577733774e-06, "loss": -0.0, "reward": 5.25, "reward_std": 0.5773502588272095, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 279 }, { "completion_length": 527.5, "epoch": 0.07760532150776053, "grad_norm": 0.24775809049606323, "kl": 0.009753408841788769, "learning_rate": 4.998771895331004e-06, "loss": 0.0, "reward": 2.5, "reward_std": 1.5, "rewards/confident_score_func": 0.25, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 280 }, { "completion_length": 516.5, "epoch": 0.07788248337028825, "grad_norm": 0.28076377511024475, "kl": 0.007833130657672882, "learning_rate": 4.998758136301295e-06, "loss": 0.0, "reward": 4.28125, "reward_std": 1.473286509513855, "rewards/confident_score_func": 1.125, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.65625, "step": 281 }, { "completion_length": 477.75, "epoch": 0.07815964523281596, "grad_norm": 0.29942917823791504, "kl": 0.009398751892149448, "learning_rate": 4.998744300645068e-06, "loss": 0.0, "reward": 3.71875, "reward_std": 2.3460404872894287, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.71875, "step": 282 }, { "completion_length": 483.5, "epoch": 0.07843680709534367, "grad_norm": 0.2916673719882965, "kl": 0.010522013530135155, "learning_rate": 4.9987303883627484e-06, "loss": 0.0, "reward": 4.25, "reward_std": 1.7320507764816284, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 283 }, { "completion_length": 526.5, "epoch": 0.0787139689578714, "grad_norm": 0.2725765109062195, "kl": 0.006917167920619249, "learning_rate": 4.998716399454762e-06, "loss": 0.0, "reward": 4.71875, "reward_std": 1.6719967126846313, "rewards/confident_score_func": 1.625, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 284 }, { "completion_length": 500.25, "epoch": 0.07899113082039912, "grad_norm": 0.27490487694740295, "kl": 0.009358661249279976, "learning_rate": 4.998702333921538e-06, "loss": -0.0, "reward": 4.875, "reward_std": 1.75, "rewards/confident_score_func": 1.625, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 285 }, { "completion_length": 518.0, "epoch": 0.07926829268292683, "grad_norm": 0.31717202067375183, "kl": 0.012628131546080112, "learning_rate": 4.998688191763508e-06, "loss": 0.0, "reward": 3.34375, "reward_std": 1.9562267065048218, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 286 }, { "completion_length": 595.5, "epoch": 0.07954545454545454, "grad_norm": 0.2649040222167969, "kl": 0.006281724199652672, "learning_rate": 4.998673972981105e-06, "loss": 0.0, "reward": 2.0, "reward_std": 0.28867512941360474, "rewards/confident_score_func": 0.25, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 287 }, { "completion_length": 500.5, "epoch": 0.07982261640798226, "grad_norm": 0.3368074893951416, "kl": 0.010276351124048233, "learning_rate": 4.9986596775747655e-06, "loss": -0.0, "reward": 4.875, "reward_std": 1.75, "rewards/confident_score_func": 1.625, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 288 }, { "completion_length": 592.5, "epoch": 0.08009977827050997, "grad_norm": 0.2393447309732437, "kl": 0.0074746571481227875, "learning_rate": 4.998645305544928e-06, "loss": 0.0, "reward": 2.875, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 289 }, { "completion_length": 592.75, "epoch": 0.0803769401330377, "grad_norm": 0.2616846561431885, "kl": 0.005527774337679148, "learning_rate": 4.998630856892033e-06, "loss": 0.0, "reward": 2.65625, "reward_std": 2.0700619220733643, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.65625, "step": 290 }, { "completion_length": 511.0, "epoch": 0.08065410199556541, "grad_norm": 0.309346079826355, "kl": 0.008619229309260845, "learning_rate": 4.998616331616524e-06, "loss": 0.0, "reward": 3.625, "reward_std": 1.314977765083313, "rewards/confident_score_func": 0.375, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 291 }, { "completion_length": 565.25, "epoch": 0.08093126385809313, "grad_norm": 0.2505832612514496, "kl": 0.00758202001452446, "learning_rate": 4.998601729718846e-06, "loss": 0.0, "reward": 2.0, "reward_std": 0.28867512941360474, "rewards/confident_score_func": 0.25, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 292 }, { "completion_length": 512.5, "epoch": 0.08120842572062084, "grad_norm": 0.3840636909008026, "kl": 0.005987056531012058, "learning_rate": 4.998587051199447e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 293 }, { "completion_length": 500.0, "epoch": 0.08148558758314856, "grad_norm": 0.2677765190601349, "kl": 0.0075174071826040745, "learning_rate": 4.998572296058777e-06, "loss": 0.0, "reward": 3.625, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.875, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 294 }, { "completion_length": 546.25, "epoch": 0.08176274944567627, "grad_norm": 0.2772693932056427, "kl": 0.025189436972141266, "learning_rate": 4.998557464297288e-06, "loss": -0.0, "reward": 2.59375, "reward_std": 1.4626424312591553, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 295 }, { "completion_length": 505.5, "epoch": 0.082039911308204, "grad_norm": 0.27071747183799744, "kl": 0.007671544793993235, "learning_rate": 4.998542555915435e-06, "loss": -0.0, "reward": 3.5, "reward_std": 2.0615527629852295, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 296 }, { "completion_length": 608.5, "epoch": 0.08231707317073171, "grad_norm": 0.25338616967201233, "kl": 0.007628303486853838, "learning_rate": 4.998527570913676e-06, "loss": -0.0, "reward": 2.375, "reward_std": 1.6007810831069946, "rewards/confident_score_func": 0.25, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 297 }, { "completion_length": 606.5, "epoch": 0.08259423503325942, "grad_norm": 0.23785322904586792, "kl": 0.007008650805801153, "learning_rate": 4.998512509292471e-06, "loss": -0.0, "reward": 1.875, "reward_std": 0.25, "rewards/confident_score_func": 0.125, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 298 }, { "completion_length": 574.75, "epoch": 0.08287139689578714, "grad_norm": 0.2740609347820282, "kl": 0.005773882381618023, "learning_rate": 4.99849737105228e-06, "loss": 0.0, "reward": 3.0, "reward_std": 1.8484227657318115, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 299 }, { "completion_length": 519.75, "epoch": 0.08314855875831485, "grad_norm": 0.3043326139450073, "kl": 0.007631715852767229, "learning_rate": 4.998482156193568e-06, "loss": -0.0, "reward": 2.375, "reward_std": 0.9464846849441528, "rewards/confident_score_func": 0.125, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 300 }, { "completion_length": 504.0, "epoch": 0.08342572062084257, "grad_norm": 1.0090659856796265, "kl": 0.007216877769678831, "learning_rate": 4.998466864716801e-06, "loss": 0.0, "reward": 2.875, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 301 }, { "completion_length": 571.5, "epoch": 0.0837028824833703, "grad_norm": 0.28385403752326965, "kl": 0.0069475905038416386, "learning_rate": 4.9984514966224505e-06, "loss": 0.0, "reward": 3.0, "reward_std": 1.8484227657318115, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 302 }, { "completion_length": 1069.0, "epoch": 0.08398004434589801, "grad_norm": 0.18082185089588165, "kl": 0.0047687627375125885, "learning_rate": 4.998436051910985e-06, "loss": 0.0, "reward": 2.46875, "reward_std": 2.7125768661499023, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.59375, "step": 303 }, { "completion_length": 678.75, "epoch": 0.08425720620842572, "grad_norm": 0.22086772322654724, "kl": 0.005693936720490456, "learning_rate": 4.998420530582878e-06, "loss": 0.0, "reward": 3.125, "reward_std": 1.7969882488250732, "rewards/confident_score_func": 0.875, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 304 }, { "completion_length": 571.75, "epoch": 0.08453436807095344, "grad_norm": 0.2723041772842407, "kl": 0.006788196973502636, "learning_rate": 4.998404932638608e-06, "loss": 0.0, "reward": 5.25, "reward_std": 0.5773502588272095, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 305 }, { "completion_length": 591.5, "epoch": 0.08481152993348115, "grad_norm": 0.2589077651500702, "kl": 0.005792629439383745, "learning_rate": 4.9983892580786505e-06, "loss": -0.0, "reward": 1.875, "reward_std": 0.25, "rewards/confident_score_func": 0.125, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 306 }, { "completion_length": 688.5, "epoch": 0.08508869179600886, "grad_norm": 0.26657506823539734, "kl": 0.009044020436704159, "learning_rate": 4.998373506903488e-06, "loss": -0.0, "reward": 4.375, "reward_std": 1.4930394887924194, "rewards/confident_score_func": 1.125, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 307 }, { "completion_length": 540.25, "epoch": 0.08536585365853659, "grad_norm": 0.2896694540977478, "kl": 0.00786986667662859, "learning_rate": 4.998357679113604e-06, "loss": 0.0, "reward": 4.0, "reward_std": 2.0615527629852295, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 308 }, { "completion_length": 547.0, "epoch": 0.0856430155210643, "grad_norm": 0.23579251766204834, "kl": 0.008822621777653694, "learning_rate": 4.998341774709482e-06, "loss": -0.0, "reward": 4.25, "reward_std": 1.7320507764816284, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 309 }, { "completion_length": 580.75, "epoch": 0.08592017738359202, "grad_norm": 0.24550193548202515, "kl": 0.00836852379143238, "learning_rate": 4.998325793691611e-06, "loss": -0.0, "reward": 1.6875, "reward_std": 0.5153881907463074, "rewards/confident_score_func": 0.125, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.6875, "step": 310 }, { "completion_length": 540.5, "epoch": 0.08619733924611973, "grad_norm": 0.27241185307502747, "kl": 0.006924949120730162, "learning_rate": 4.99830973606048e-06, "loss": 0.0, "reward": 2.625, "reward_std": 1.4361406564712524, "rewards/confident_score_func": 0.375, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 311 }, { "completion_length": 544.0, "epoch": 0.08647450110864745, "grad_norm": 0.30217209458351135, "kl": 0.008890831843018532, "learning_rate": 4.998293601816582e-06, "loss": 0.0, "reward": 4.0, "reward_std": 2.020725965499878, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 312 }, { "completion_length": 514.25, "epoch": 0.08675166297117516, "grad_norm": 0.2723393738269806, "kl": 0.006328696850687265, "learning_rate": 4.998277390960413e-06, "loss": -0.0, "reward": 4.375, "reward_std": 1.8874585628509521, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.75, "step": 313 }, { "completion_length": 456.25, "epoch": 0.08702882483370289, "grad_norm": 0.3072836697101593, "kl": 0.007432885002344847, "learning_rate": 4.998261103492468e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 314 }, { "completion_length": 446.5, "epoch": 0.0873059866962306, "grad_norm": 0.2854975461959839, "kl": 0.008049027062952518, "learning_rate": 4.9982447394132475e-06, "loss": -0.0, "reward": 5.25, "reward_std": 0.5773502588272095, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 315 }, { "completion_length": 566.25, "epoch": 0.08758314855875832, "grad_norm": 0.23696820437908173, "kl": 0.0104129109531641, "learning_rate": 4.998228298723254e-06, "loss": -0.0, "reward": 5.5, "reward_std": 0.5, "rewards/confident_score_func": 1.75, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 316 }, { "completion_length": 609.0, "epoch": 0.08786031042128603, "grad_norm": 0.30479153990745544, "kl": 0.008555449545383453, "learning_rate": 4.998211781422991e-06, "loss": -0.0, "reward": 2.0, "reward_std": 0.28867512941360474, "rewards/confident_score_func": 0.25, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 317 }, { "completion_length": 641.75, "epoch": 0.08813747228381374, "grad_norm": 0.2540670335292816, "kl": 0.00729199917986989, "learning_rate": 4.998195187512964e-06, "loss": -0.0, "reward": 3.375, "reward_std": 2.212653160095215, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 318 }, { "completion_length": 473.5, "epoch": 0.08841463414634146, "grad_norm": 0.3386506736278534, "kl": 0.00992652028799057, "learning_rate": 4.998178516993683e-06, "loss": 0.0, "reward": 3.0, "reward_std": 1.8484227657318115, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 319 }, { "completion_length": 597.0, "epoch": 0.08869179600886919, "grad_norm": 0.26284879446029663, "kl": 0.013334193266928196, "learning_rate": 4.99816176986566e-06, "loss": 0.0, "reward": 2.75, "reward_std": 1.3540064096450806, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 320 }, { "completion_length": 492.0, "epoch": 0.0889689578713969, "grad_norm": 0.2996481955051422, "kl": 0.0075319381430745125, "learning_rate": 4.998144946129407e-06, "loss": 0.0, "reward": 3.625, "reward_std": 2.1746647357940674, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.75, "step": 321 }, { "completion_length": 529.75, "epoch": 0.08924611973392461, "grad_norm": 0.0, "kl": 0.00888336356729269, "learning_rate": 4.9981280457854406e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 322 }, { "completion_length": 501.0, "epoch": 0.08952328159645233, "grad_norm": 0.3193089962005615, "kl": 0.008006121963262558, "learning_rate": 4.998111068834278e-06, "loss": -0.0, "reward": 4.625, "reward_std": 1.6520190238952637, "rewards/confident_score_func": 1.375, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 323 }, { "completion_length": 518.0, "epoch": 0.08980044345898004, "grad_norm": 0.28511443734169006, "kl": 0.010036446154117584, "learning_rate": 4.998094015276442e-06, "loss": 0.0, "reward": 3.625, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.875, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 324 }, { "completion_length": 507.75, "epoch": 0.09007760532150776, "grad_norm": 0.5216866135597229, "kl": 0.010400841943919659, "learning_rate": 4.998076885112454e-06, "loss": -0.0, "reward": 3.0, "reward_std": 1.5, "rewards/confident_score_func": 0.25, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 325 }, { "completion_length": 554.25, "epoch": 0.09035476718403547, "grad_norm": 0.2701990604400635, "kl": 0.010388349182903767, "learning_rate": 4.99805967834284e-06, "loss": 0.0, "reward": 3.125, "reward_std": 1.6007810831069946, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.75, "step": 326 }, { "completion_length": 577.25, "epoch": 0.0906319290465632, "grad_norm": 0.2696262001991272, "kl": 0.01161970105022192, "learning_rate": 4.998042394968127e-06, "loss": 0.0, "reward": 4.84375, "reward_std": 1.8125, "rewards/confident_score_func": 1.75, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 327 }, { "completion_length": 583.25, "epoch": 0.09090909090909091, "grad_norm": 0.29645928740501404, "kl": 0.008947372436523438, "learning_rate": 4.998025034988846e-06, "loss": -0.0, "reward": 3.96875, "reward_std": 2.9339518547058105, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 328 }, { "completion_length": 564.0, "epoch": 0.09118625277161863, "grad_norm": 0.25202324986457825, "kl": 0.01646338775753975, "learning_rate": 4.998007598405527e-06, "loss": 0.0, "reward": 3.125, "reward_std": 1.75, "rewards/confident_score_func": 0.875, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 329 }, { "completion_length": 526.75, "epoch": 0.09146341463414634, "grad_norm": 0.27728745341300964, "kl": 0.009050238877534866, "learning_rate": 4.997990085218709e-06, "loss": -0.0, "reward": 3.625, "reward_std": 1.314977765083313, "rewards/confident_score_func": 0.375, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 330 }, { "completion_length": 650.5, "epoch": 0.09174057649667405, "grad_norm": 0.24188946187496185, "kl": 0.00791819766163826, "learning_rate": 4.997972495428924e-06, "loss": -0.0, "reward": 2.875, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 331 }, { "completion_length": 931.75, "epoch": 0.09201773835920177, "grad_norm": 0.2899530231952667, "kl": 0.008083630353212357, "learning_rate": 4.997954829036715e-06, "loss": 0.0, "reward": 1.84375, "reward_std": 2.9072020053863525, "rewards/confident_score_func": 0.375, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.25, "rewards/soft_format_reward_func": 0.25, "rewards/xmlcount_reward_func": 0.46875, "step": 332 }, { "completion_length": 519.0, "epoch": 0.0922949002217295, "grad_norm": 0.3107350170612335, "kl": 0.015942372381687164, "learning_rate": 4.997937086042623e-06, "loss": -0.0, "reward": 3.59375, "reward_std": 2.5028629302978516, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 333 }, { "completion_length": 585.0, "epoch": 0.09257206208425721, "grad_norm": 0.24886614084243774, "kl": 0.007958298549056053, "learning_rate": 4.997919266447191e-06, "loss": 0.0, "reward": 5.0, "reward_std": 0.5, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 334 }, { "completion_length": 579.0, "epoch": 0.09284922394678492, "grad_norm": 0.2713007926940918, "kl": 0.01936263032257557, "learning_rate": 4.997901370250966e-06, "loss": -0.0, "reward": 3.625, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.875, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 335 }, { "completion_length": 421.75, "epoch": 0.09312638580931264, "grad_norm": 0.32639744877815247, "kl": 0.012141681276261806, "learning_rate": 4.997883397454498e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 336 }, { "completion_length": 576.0, "epoch": 0.09340354767184035, "grad_norm": 0.26375219225883484, "kl": 0.00893515907227993, "learning_rate": 4.997865348058337e-06, "loss": 0.0, "reward": 3.125, "reward_std": 1.3768926858901978, "rewards/confident_score_func": 0.375, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 337 }, { "completion_length": 495.75, "epoch": 0.09368070953436806, "grad_norm": 0.28011229634284973, "kl": 0.014946239069104195, "learning_rate": 4.9978472220630356e-06, "loss": -0.0, "reward": 5.25, "reward_std": 0.5773502588272095, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 338 }, { "completion_length": 518.25, "epoch": 0.09395787139689579, "grad_norm": 0.0, "kl": 0.012209571897983551, "learning_rate": 4.997829019469151e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 339 }, { "completion_length": 537.25, "epoch": 0.0942350332594235, "grad_norm": 0.2664746642112732, "kl": 0.009598378092050552, "learning_rate": 4.997810740277242e-06, "loss": 0.0, "reward": 3.125, "reward_std": 1.6007810831069946, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.75, "step": 340 }, { "completion_length": 467.75, "epoch": 0.09451219512195122, "grad_norm": 0.29182934761047363, "kl": 0.00878247618675232, "learning_rate": 4.997792384487867e-06, "loss": 0.0, "reward": 2.0, "reward_std": 0.28867512941360474, "rewards/confident_score_func": 0.25, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 341 }, { "completion_length": 578.75, "epoch": 0.09478935698447893, "grad_norm": 0.0, "kl": 0.008617885410785675, "learning_rate": 4.997773952101591e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 342 }, { "completion_length": 600.75, "epoch": 0.09506651884700665, "grad_norm": 0.24266192317008972, "kl": 0.010930730029940605, "learning_rate": 4.9977554431189786e-06, "loss": 0.0, "reward": 4.375, "reward_std": 1.4930394887924194, "rewards/confident_score_func": 1.125, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 343 }, { "completion_length": 642.75, "epoch": 0.09534368070953436, "grad_norm": 0.21375784277915955, "kl": 0.006948431022465229, "learning_rate": 4.9977368575405965e-06, "loss": 0.0, "reward": 4.875, "reward_std": 1.75, "rewards/confident_score_func": 1.625, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 344 }, { "completion_length": 540.25, "epoch": 0.09562084257206209, "grad_norm": 0.2864329218864441, "kl": 0.009313181973993778, "learning_rate": 4.997718195367015e-06, "loss": 0.0, "reward": 3.34375, "reward_std": 2.253180503845215, "rewards/confident_score_func": 0.875, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 345 }, { "completion_length": 563.0, "epoch": 0.0958980044345898, "grad_norm": 0.25084638595581055, "kl": 0.007333079352974892, "learning_rate": 4.9976994565988076e-06, "loss": -0.0, "reward": 5.5, "reward_std": 0.5, "rewards/confident_score_func": 1.75, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 346 }, { "completion_length": 518.75, "epoch": 0.09617516629711752, "grad_norm": 0.2824084758758545, "kl": 0.008606897667050362, "learning_rate": 4.9976806412365475e-06, "loss": -0.0, "reward": 3.375, "reward_std": 1.6007810831069946, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 347 }, { "completion_length": 645.5, "epoch": 0.09645232815964523, "grad_norm": 0.19883979856967926, "kl": 0.010571745224297047, "learning_rate": 4.997661749280812e-06, "loss": 0.0, "reward": 2.78125, "reward_std": 1.74515700340271, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.65625, "step": 348 }, { "completion_length": 514.5, "epoch": 0.09672949002217295, "grad_norm": 0.2860449552536011, "kl": 0.011602706275880337, "learning_rate": 4.997642780732181e-06, "loss": -0.0, "reward": 4.875, "reward_std": 1.75, "rewards/confident_score_func": 1.625, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 349 }, { "completion_length": 526.5, "epoch": 0.09700665188470066, "grad_norm": 0.291399747133255, "kl": 0.011087669059634209, "learning_rate": 4.997623735591236e-06, "loss": 0.0, "reward": 5.5, "reward_std": 0.5, "rewards/confident_score_func": 1.75, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 350 }, { "completion_length": 532.0, "epoch": 0.09728381374722839, "grad_norm": 0.29351261258125305, "kl": 0.010485474951565266, "learning_rate": 4.997604613858561e-06, "loss": -0.0, "reward": 4.25, "reward_std": 1.7320507764816284, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 351 }, { "completion_length": 510.25, "epoch": 0.0975609756097561, "grad_norm": 0.33261212706565857, "kl": 0.010849088430404663, "learning_rate": 4.997585415534742e-06, "loss": 0.0, "reward": 4.875, "reward_std": 1.75, "rewards/confident_score_func": 1.625, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 352 }, { "completion_length": 550.5, "epoch": 0.09783813747228381, "grad_norm": 0.2817225754261017, "kl": 0.008797934278845787, "learning_rate": 4.9975661406203676e-06, "loss": 0.0, "reward": 2.5, "reward_std": 1.5, "rewards/confident_score_func": 0.25, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 353 }, { "completion_length": 477.0, "epoch": 0.09811529933481153, "grad_norm": 0.31825873255729675, "kl": 0.010974342003464699, "learning_rate": 4.99754678911603e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.625, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 354 }, { "completion_length": 473.75, "epoch": 0.09839246119733924, "grad_norm": 0.0, "kl": 0.0131355756893754, "learning_rate": 4.9975273610223215e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 355 }, { "completion_length": 632.25, "epoch": 0.09866962305986696, "grad_norm": 0.2742297053337097, "kl": 0.006825553253293037, "learning_rate": 4.997507856339839e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.041241407394409, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.625, "step": 356 }, { "completion_length": 615.75, "epoch": 0.09894678492239468, "grad_norm": 0.291454017162323, "kl": 0.008995168842375278, "learning_rate": 4.997488275069179e-06, "loss": -0.0, "reward": 3.5, "reward_std": 1.443375587463379, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 357 }, { "completion_length": 532.0, "epoch": 0.0992239467849224, "grad_norm": 0.3130916357040405, "kl": 0.010285998694598675, "learning_rate": 4.997468617210943e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 358 }, { "completion_length": 577.5, "epoch": 0.09950110864745011, "grad_norm": 0.0, "kl": 0.008849053643643856, "learning_rate": 4.997448882765734e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 359 }, { "completion_length": 600.5, "epoch": 0.09977827050997783, "grad_norm": 0.28047049045562744, "kl": 0.00806396547704935, "learning_rate": 4.997429071734156e-06, "loss": 0.0, "reward": 2.125, "reward_std": 0.25, "rewards/confident_score_func": 0.375, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 360 }, { "completion_length": 546.0, "epoch": 0.10005543237250554, "grad_norm": 0.2801942527294159, "kl": 0.00986945629119873, "learning_rate": 4.9974091841168195e-06, "loss": 0.0, "reward": 3.625, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.875, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 361 }, { "completion_length": 595.25, "epoch": 0.10033259423503325, "grad_norm": 0.23681581020355225, "kl": 0.009464333765208721, "learning_rate": 4.997389219914331e-06, "loss": -0.0, "reward": 4.125, "reward_std": 1.25, "rewards/confident_score_func": 0.875, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 362 }, { "completion_length": 618.0, "epoch": 0.10060975609756098, "grad_norm": 0.24660171568393707, "kl": 0.014353018254041672, "learning_rate": 4.997369179127304e-06, "loss": 0.0, "reward": 3.0, "reward_std": 1.8484227657318115, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 363 }, { "completion_length": 596.0, "epoch": 0.1008869179600887, "grad_norm": 0.2683423161506653, "kl": 0.008174603804945946, "learning_rate": 4.997349061756353e-06, "loss": 0.0, "reward": 3.75, "reward_std": 1.7795131206512451, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 364 }, { "completion_length": 518.25, "epoch": 0.10116407982261641, "grad_norm": 0.0, "kl": 0.012032673694193363, "learning_rate": 4.997328867802095e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 365 }, { "completion_length": 619.0, "epoch": 0.10144124168514412, "grad_norm": 0.2190849334001541, "kl": 0.008783023804426193, "learning_rate": 4.997308597265149e-06, "loss": 0.0, "reward": 5.5, "reward_std": 0.5, "rewards/confident_score_func": 1.75, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 366 }, { "completion_length": 490.75, "epoch": 0.10171840354767184, "grad_norm": 0.2837628126144409, "kl": 0.010267123579978943, "learning_rate": 4.997288250146138e-06, "loss": 0.0, "reward": 5.5, "reward_std": 0.5, "rewards/confident_score_func": 1.75, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 367 }, { "completion_length": 562.25, "epoch": 0.10199556541019955, "grad_norm": 0.2866198420524597, "kl": 0.010968263261020184, "learning_rate": 4.9972678264456846e-06, "loss": 0.0, "reward": 5.15625, "reward_std": 0.702488124370575, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.65625, "step": 368 }, { "completion_length": 543.75, "epoch": 0.10227272727272728, "grad_norm": 0.0, "kl": 0.008264879696071148, "learning_rate": 4.9972473261644135e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.125, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 369 }, { "completion_length": 536.5, "epoch": 0.10254988913525499, "grad_norm": 0.28394559025764465, "kl": 0.00834739115089178, "learning_rate": 4.997226749302957e-06, "loss": -0.0, "reward": 3.5, "reward_std": 2.0615527629852295, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 370 }, { "completion_length": 513.0, "epoch": 0.1028270509977827, "grad_norm": 0.294697642326355, "kl": 0.01112053357064724, "learning_rate": 4.997206095861944e-06, "loss": -0.0, "reward": 3.5, "reward_std": 2.0615527629852295, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 371 }, { "completion_length": 724.75, "epoch": 0.10310421286031042, "grad_norm": 0.2654609978199005, "kl": 0.00890235137194395, "learning_rate": 4.997185365842008e-06, "loss": 0.0, "reward": 3.0, "reward_std": 1.8484227657318115, "rewards/confident_score_func": 0.875, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.625, "step": 372 }, { "completion_length": 515.25, "epoch": 0.10338137472283813, "grad_norm": 0.2891392111778259, "kl": 0.017710192129015923, "learning_rate": 4.997164559243785e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 373 }, { "completion_length": 559.75, "epoch": 0.10365853658536585, "grad_norm": 0.2877143621444702, "kl": 0.009405824355781078, "learning_rate": 4.997143676067913e-06, "loss": -0.0, "reward": 1.59375, "reward_std": 0.6875, "rewards/confident_score_func": 0.125, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 374 }, { "completion_length": 599.0, "epoch": 0.10393569844789358, "grad_norm": 0.2525196969509125, "kl": 0.010500085540115833, "learning_rate": 4.997122716315032e-06, "loss": -0.0, "reward": 3.625, "reward_std": 2.462214469909668, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 375 }, { "completion_length": 567.75, "epoch": 0.10421286031042129, "grad_norm": 0.28292903304100037, "kl": 0.008447705768048763, "learning_rate": 4.997101679985784e-06, "loss": -0.0, "reward": 4.25, "reward_std": 1.7320507764816284, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 376 }, { "completion_length": 564.25, "epoch": 0.104490022172949, "grad_norm": 0.30437031388282776, "kl": 0.008061797358095646, "learning_rate": 4.9970805670808174e-06, "loss": 0.0, "reward": 4.5, "reward_std": 1.8929693698883057, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 377 }, { "completion_length": 585.25, "epoch": 0.10476718403547672, "grad_norm": 0.5474949479103088, "kl": 0.0072782705537974834, "learning_rate": 4.997059377600776e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 378 }, { "completion_length": 500.75, "epoch": 0.10504434589800443, "grad_norm": 0.2867445945739746, "kl": 0.0070358081720769405, "learning_rate": 4.9970381115463105e-06, "loss": -0.0, "reward": 2.84375, "reward_std": 1.9455478191375732, "rewards/confident_score_func": 0.25, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 379 }, { "completion_length": 618.5, "epoch": 0.10532150776053215, "grad_norm": 0.26157820224761963, "kl": 0.032926157116889954, "learning_rate": 4.997016768918075e-06, "loss": -0.0, "reward": 4.34375, "reward_std": 1.8912490606307983, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 380 }, { "completion_length": 632.5, "epoch": 0.10559866962305987, "grad_norm": 0.2731629014015198, "kl": 0.008561252616345882, "learning_rate": 4.996995349716722e-06, "loss": -0.0, "reward": 3.34375, "reward_std": 1.846660852432251, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 381 }, { "completion_length": 587.25, "epoch": 0.10587583148558759, "grad_norm": 0.26181933283805847, "kl": 0.00872268620878458, "learning_rate": 4.996973853942908e-06, "loss": 0.0, "reward": 2.625, "reward_std": 1.4361406564712524, "rewards/confident_score_func": 0.375, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 382 }, { "completion_length": 573.0, "epoch": 0.1061529933481153, "grad_norm": 0.0, "kl": 0.010241072624921799, "learning_rate": 4.996952281597294e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 383 }, { "completion_length": 560.5, "epoch": 0.10643015521064302, "grad_norm": 0.23241972923278809, "kl": 0.008187344297766685, "learning_rate": 4.996930632680541e-06, "loss": 0.0, "reward": 4.5, "reward_std": 1.8929693698883057, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 384 }, { "completion_length": 595.25, "epoch": 0.10670731707317073, "grad_norm": 0.2234736829996109, "kl": 0.009712145663797855, "learning_rate": 4.996908907193311e-06, "loss": -0.0, "reward": 1.875, "reward_std": 0.25, "rewards/confident_score_func": 0.125, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 385 }, { "completion_length": 463.75, "epoch": 0.10698447893569844, "grad_norm": 0.2810760736465454, "kl": 0.010606026276946068, "learning_rate": 4.996887105136273e-06, "loss": -0.0, "reward": 3.375, "reward_std": 1.6007810831069946, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 386 }, { "completion_length": 574.0, "epoch": 0.10726164079822616, "grad_norm": 0.24918359518051147, "kl": 0.01038257498294115, "learning_rate": 4.996865226510094e-06, "loss": -0.0, "reward": 3.75, "reward_std": 1.7795131206512451, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 387 }, { "completion_length": 579.5, "epoch": 0.10753880266075388, "grad_norm": 0.28346163034439087, "kl": 0.008679847232997417, "learning_rate": 4.9968432713154445e-06, "loss": -0.0, "reward": 3.5, "reward_std": 2.0615527629852295, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 388 }, { "completion_length": 567.0, "epoch": 0.1078159645232816, "grad_norm": 0.2564204931259155, "kl": 0.008754831738770008, "learning_rate": 4.996821239552999e-06, "loss": -0.0, "reward": 3.25, "reward_std": 1.7320507764816284, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 389 }, { "completion_length": 481.5, "epoch": 0.10809312638580931, "grad_norm": 0.29320141673088074, "kl": 0.010893772356212139, "learning_rate": 4.996799131223433e-06, "loss": -0.0, "reward": 2.875, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 390 }, { "completion_length": 644.5, "epoch": 0.10837028824833703, "grad_norm": 0.2689422369003296, "kl": 0.008407491259276867, "learning_rate": 4.996776946327423e-06, "loss": -0.0, "reward": 3.21875, "reward_std": 1.7089684009552002, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.71875, "step": 391 }, { "completion_length": 557.75, "epoch": 0.10864745011086474, "grad_norm": 0.362271785736084, "kl": 0.010499771684408188, "learning_rate": 4.996754684865651e-06, "loss": -0.0, "reward": 4.3125, "reward_std": 1.4772582054138184, "rewards/confident_score_func": 1.125, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.6875, "step": 392 }, { "completion_length": 645.25, "epoch": 0.10892461197339245, "grad_norm": 0.24999703466892242, "kl": 0.009168712422251701, "learning_rate": 4.9967323468388e-06, "loss": 0.0, "reward": 3.5, "reward_std": 2.0615527629852295, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 393 }, { "completion_length": 464.25, "epoch": 0.10920177383592018, "grad_norm": 0.3000907301902771, "kl": 0.012932860292494297, "learning_rate": 4.996709932247553e-06, "loss": -0.0, "reward": 1.875, "reward_std": 0.25, "rewards/confident_score_func": 0.125, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 394 }, { "completion_length": 598.5, "epoch": 0.1094789356984479, "grad_norm": 0.2674823999404907, "kl": 0.02078094519674778, "learning_rate": 4.996687441092598e-06, "loss": -0.0, "reward": 2.875, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 395 }, { "completion_length": 556.75, "epoch": 0.10975609756097561, "grad_norm": 0.28585389256477356, "kl": 0.009022681973874569, "learning_rate": 4.996664873374626e-06, "loss": 0.0, "reward": 2.875, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 396 }, { "completion_length": 549.5, "epoch": 0.11003325942350332, "grad_norm": 0.2722783386707306, "kl": 0.010802957229316235, "learning_rate": 4.9966422290943285e-06, "loss": 0.0, "reward": 3.0, "reward_std": 1.8484227657318115, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 397 }, { "completion_length": 603.25, "epoch": 0.11031042128603104, "grad_norm": 0.2457823008298874, "kl": 0.013656833209097385, "learning_rate": 4.996619508252399e-06, "loss": -0.0, "reward": 4.875, "reward_std": 1.75, "rewards/confident_score_func": 1.625, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 398 }, { "completion_length": 526.75, "epoch": 0.11058758314855875, "grad_norm": 0.2862286865711212, "kl": 0.013049881905317307, "learning_rate": 4.996596710849535e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 399 }, { "completion_length": 571.75, "epoch": 0.11086474501108648, "grad_norm": 0.2510634660720825, "kl": 0.008914025500416756, "learning_rate": 4.9965738368864345e-06, "loss": 0.0, "reward": 3.625, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.875, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 400 }, { "completion_length": 562.5, "epoch": 0.1111419068736142, "grad_norm": 0.2769409418106079, "kl": 0.010706186294555664, "learning_rate": 4.996550886363801e-06, "loss": -0.0, "reward": 1.8125, "reward_std": 0.3145764470100403, "rewards/confident_score_func": 0.125, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.6875, "step": 401 }, { "completion_length": 527.5, "epoch": 0.11141906873614191, "grad_norm": 0.3236878514289856, "kl": 0.007896852679550648, "learning_rate": 4.996527859282337e-06, "loss": 0.0, "reward": 2.84375, "reward_std": 2.448160409927368, "rewards/confident_score_func": 0.375, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 402 }, { "completion_length": 539.25, "epoch": 0.11169623059866962, "grad_norm": 0.30927106738090515, "kl": 0.011580236256122589, "learning_rate": 4.99650475564275e-06, "loss": -0.0, "reward": 4.875, "reward_std": 1.75, "rewards/confident_score_func": 1.625, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 403 }, { "completion_length": 484.75, "epoch": 0.11197339246119734, "grad_norm": 0.3029186427593231, "kl": 0.012618943117558956, "learning_rate": 4.996481575445745e-06, "loss": 0.0, "reward": 5.5, "reward_std": 0.5, "rewards/confident_score_func": 1.75, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 404 }, { "completion_length": 607.75, "epoch": 0.11225055432372505, "grad_norm": 0.0, "kl": 0.009084698744118214, "learning_rate": 4.996458318692037e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 405 }, { "completion_length": 550.0, "epoch": 0.11252771618625278, "grad_norm": 0.24101264774799347, "kl": 0.011917212046682835, "learning_rate": 4.996434985382337e-06, "loss": 0.0, "reward": 2.0, "reward_std": 0.5, "rewards/confident_score_func": -0.25, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 406 }, { "completion_length": 551.75, "epoch": 0.11280487804878049, "grad_norm": 0.2669553756713867, "kl": 0.015854548662900925, "learning_rate": 4.99641157551736e-06, "loss": 0.0, "reward": 4.59375, "reward_std": 1.9185905456542969, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 407 }, { "completion_length": 427.5, "epoch": 0.1130820399113082, "grad_norm": 0.32891786098480225, "kl": 0.014153419993817806, "learning_rate": 4.996388089097826e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 408 }, { "completion_length": 490.0, "epoch": 0.11335920177383592, "grad_norm": 0.29333165287971497, "kl": 0.013264011591672897, "learning_rate": 4.996364526124453e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 409 }, { "completion_length": 566.25, "epoch": 0.11363636363636363, "grad_norm": 0.321139931678772, "kl": 0.010038134641945362, "learning_rate": 4.996340886597966e-06, "loss": -0.0, "reward": 4.0, "reward_std": 2.020725965499878, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 410 }, { "completion_length": 495.5, "epoch": 0.11391352549889135, "grad_norm": 0.2806561291217804, "kl": 0.011251037940382957, "learning_rate": 4.996317170519087e-06, "loss": 0.0, "reward": 2.875, "reward_std": 1.25, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 411 }, { "completion_length": 529.0, "epoch": 0.11419068736141907, "grad_norm": 0.3078991174697876, "kl": 0.011859896592795849, "learning_rate": 4.996293377888546e-06, "loss": -0.0, "reward": 3.25, "reward_std": 1.7320507764816284, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 412 }, { "completion_length": 513.5, "epoch": 0.11446784922394679, "grad_norm": 0.3305742144584656, "kl": 0.011826644651591778, "learning_rate": 4.99626950870707e-06, "loss": 0.0, "reward": 3.625, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.875, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 413 }, { "completion_length": 392.5, "epoch": 0.1147450110864745, "grad_norm": 0.32554683089256287, "kl": 0.045970603823661804, "learning_rate": 4.996245562975394e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 414 }, { "completion_length": 806.25, "epoch": 0.11502217294900222, "grad_norm": 0.2838173508644104, "kl": 0.02108391374349594, "learning_rate": 4.99622154069425e-06, "loss": -0.0, "reward": 2.21875, "reward_std": 2.7240729331970215, "rewards/confident_score_func": 0.375, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.59375, "step": 415 }, { "completion_length": 955.0, "epoch": 0.11529933481152993, "grad_norm": 0.24369527399539948, "kl": 0.00988956168293953, "learning_rate": 4.996197441864375e-06, "loss": -0.0, "reward": 2.5, "reward_std": 1.5, "rewards/confident_score_func": 0.25, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 416 }, { "completion_length": 475.0, "epoch": 0.11557649667405764, "grad_norm": 0.28483524918556213, "kl": 0.00985225010663271, "learning_rate": 4.9961732664865085e-06, "loss": 0.0, "reward": 4.875, "reward_std": 1.75, "rewards/confident_score_func": 1.625, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 417 }, { "completion_length": 480.25, "epoch": 0.11585365853658537, "grad_norm": 0.30248865485191345, "kl": 0.013049555942416191, "learning_rate": 4.996149014561392e-06, "loss": 0.0, "reward": 5.25, "reward_std": 0.5773502588272095, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 418 }, { "completion_length": 635.25, "epoch": 0.11613082039911309, "grad_norm": 0.23825614154338837, "kl": 0.00919595081359148, "learning_rate": 4.996124686089769e-06, "loss": -0.0, "reward": 2.0, "reward_std": 0.28867512941360474, "rewards/confident_score_func": 0.25, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 419 }, { "completion_length": 675.0, "epoch": 0.1164079822616408, "grad_norm": 0.2727813124656677, "kl": 0.013185232877731323, "learning_rate": 4.996100281072385e-06, "loss": 0.0, "reward": 3.75, "reward_std": 1.7795131206512451, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 420 }, { "completion_length": 593.5, "epoch": 0.11668514412416851, "grad_norm": 0.3139597773551941, "kl": 0.015473646111786366, "learning_rate": 4.9960757995099895e-06, "loss": 0.0, "reward": 4.5, "reward_std": 1.8929693698883057, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 421 }, { "completion_length": 556.0, "epoch": 0.11696230598669623, "grad_norm": 0.2492779642343521, "kl": 0.010901713743805885, "learning_rate": 4.9960512414033325e-06, "loss": -0.0, "reward": 3.375, "reward_std": 1.6007810831069946, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 422 }, { "completion_length": 655.0, "epoch": 0.11723946784922394, "grad_norm": 0.2776035666465759, "kl": 0.008187444880604744, "learning_rate": 4.996026606753167e-06, "loss": 0.0, "reward": 3.75, "reward_std": 1.7795131206512451, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 423 }, { "completion_length": 572.25, "epoch": 0.11751662971175167, "grad_norm": 0.2734625041484833, "kl": 0.011468918062746525, "learning_rate": 4.996001895560249e-06, "loss": -0.0, "reward": 2.5, "reward_std": 1.5, "rewards/confident_score_func": 0.25, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 424 }, { "completion_length": 560.5, "epoch": 0.11779379157427938, "grad_norm": 0.27693501114845276, "kl": 0.01610645465552807, "learning_rate": 4.995977107825336e-06, "loss": 0.0, "reward": 4.875, "reward_std": 1.75, "rewards/confident_score_func": 1.625, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 425 }, { "completion_length": 526.5, "epoch": 0.1180709534368071, "grad_norm": 0.3647044599056244, "kl": 0.03527112305164337, "learning_rate": 4.995952243549188e-06, "loss": -0.0, "reward": 2.875, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 426 }, { "completion_length": 562.5, "epoch": 0.11834811529933481, "grad_norm": 0.29632264375686646, "kl": 0.011425329372286797, "learning_rate": 4.9959273027325675e-06, "loss": 0.0, "reward": 3.40625, "reward_std": 2.1731672286987305, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.65625, "step": 427 }, { "completion_length": 491.5, "epoch": 0.11862527716186252, "grad_norm": 0.2910871207714081, "kl": 0.011075750924646854, "learning_rate": 4.99590228537624e-06, "loss": 0.0, "reward": 3.75, "reward_std": 1.7795131206512451, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 428 }, { "completion_length": 699.0, "epoch": 0.11890243902439024, "grad_norm": 0.2638415992259979, "kl": 0.011031696572899818, "learning_rate": 4.995877191480971e-06, "loss": -0.0, "reward": 5.5, "reward_std": 0.5, "rewards/confident_score_func": 1.75, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 429 }, { "completion_length": 553.75, "epoch": 0.11917960088691797, "grad_norm": 0.2916988432407379, "kl": 0.01419891882687807, "learning_rate": 4.9958520210475315e-06, "loss": 0.0, "reward": 3.875, "reward_std": 2.1746647357940674, "rewards/confident_score_func": 1.125, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 430 }, { "completion_length": 516.25, "epoch": 0.11945676274944568, "grad_norm": 0.24851743876934052, "kl": 0.012266779318451881, "learning_rate": 4.995826774076693e-06, "loss": 0.0, "reward": 2.5, "reward_std": 1.5, "rewards/confident_score_func": 0.25, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 431 }, { "completion_length": 485.25, "epoch": 0.1197339246119734, "grad_norm": 0.30134379863739014, "kl": 0.013474212028086185, "learning_rate": 4.99580145056923e-06, "loss": -0.0, "reward": 3.875, "reward_std": 1.6520190238952637, "rewards/confident_score_func": 1.125, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 432 }, { "completion_length": 529.0, "epoch": 0.12001108647450111, "grad_norm": 0.3121477961540222, "kl": 0.02145874686539173, "learning_rate": 4.995776050525919e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 433 }, { "completion_length": 589.75, "epoch": 0.12028824833702882, "grad_norm": 0.2440590262413025, "kl": 0.011722201481461525, "learning_rate": 4.995750573947538e-06, "loss": 0.0, "reward": 4.5, "reward_std": 1.8929693698883057, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 434 }, { "completion_length": 533.5, "epoch": 0.12056541019955654, "grad_norm": 0.0, "kl": 0.014227804727852345, "learning_rate": 4.9957250208348696e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 435 }, { "completion_length": 633.75, "epoch": 0.12084257206208426, "grad_norm": 0.2628498375415802, "kl": 0.00986568070948124, "learning_rate": 4.995699391188696e-06, "loss": -0.0, "reward": 2.5625, "reward_std": 1.4912941455841064, "rewards/confident_score_func": 0.375, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.6875, "step": 436 }, { "completion_length": 478.25, "epoch": 0.12111973392461198, "grad_norm": 0.2878575325012207, "kl": 0.013845988549292088, "learning_rate": 4.995673685009806e-06, "loss": 0.0, "reward": 3.375, "reward_std": 1.7969882488250732, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 437 }, { "completion_length": 588.75, "epoch": 0.12139689578713969, "grad_norm": 0.38887572288513184, "kl": 0.011323816142976284, "learning_rate": 4.9956479022989836e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 438 }, { "completion_length": 601.75, "epoch": 0.1216740576496674, "grad_norm": 0.2575204372406006, "kl": 0.011730782687664032, "learning_rate": 4.995622043057023e-06, "loss": 0.0, "reward": 4.5, "reward_std": 1.8929693698883057, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 439 }, { "completion_length": 528.5, "epoch": 0.12195121951219512, "grad_norm": 0.2786175310611725, "kl": 0.012202276848256588, "learning_rate": 4.9955961072847145e-06, "loss": -0.0, "reward": 3.875, "reward_std": 2.1746647357940674, "rewards/confident_score_func": 1.125, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 440 }, { "completion_length": 554.0, "epoch": 0.12222838137472283, "grad_norm": 0.3509441912174225, "kl": 0.011919002048671246, "learning_rate": 4.995570094982856e-06, "loss": -0.0, "reward": 2.59375, "reward_std": 1.6875, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 441 }, { "completion_length": 560.75, "epoch": 0.12250554323725056, "grad_norm": 0.3583238422870636, "kl": 0.014090409502387047, "learning_rate": 4.995544006152243e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 442 }, { "completion_length": 568.5, "epoch": 0.12278270509977827, "grad_norm": 0.31855615973472595, "kl": 0.013690127059817314, "learning_rate": 4.995517840793677e-06, "loss": 0.0, "reward": 3.25, "reward_std": 1.7320507764816284, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 443 }, { "completion_length": 943.75, "epoch": 0.12305986696230599, "grad_norm": 0.25765693187713623, "kl": 0.01062240544706583, "learning_rate": 4.99549159890796e-06, "loss": 0.0, "reward": 2.625, "reward_std": 1.4361406564712524, "rewards/confident_score_func": 0.375, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 444 }, { "completion_length": 621.25, "epoch": 0.1233370288248337, "grad_norm": 0.255987286567688, "kl": 0.011262648738920689, "learning_rate": 4.995465280495897e-06, "loss": 0.0, "reward": 3.25, "reward_std": 1.7320507764816284, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 445 }, { "completion_length": 525.0, "epoch": 0.12361419068736142, "grad_norm": 0.3379601240158081, "kl": 0.011584070511162281, "learning_rate": 4.995438885558294e-06, "loss": -0.0, "reward": 5.0, "reward_std": 0.9574271440505981, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 446 }, { "completion_length": 608.5, "epoch": 0.12389135254988913, "grad_norm": 0.0, "kl": 0.013573270291090012, "learning_rate": 4.995412414095961e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 447 }, { "completion_length": 570.5, "epoch": 0.12416851441241686, "grad_norm": 0.27646541595458984, "kl": 0.014613611623644829, "learning_rate": 4.995385866109711e-06, "loss": 0.0, "reward": 1.71875, "reward_std": 0.46069467067718506, "rewards/confident_score_func": 0.125, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 448 }, { "completion_length": 555.75, "epoch": 0.12444567627494457, "grad_norm": 0.30039241909980774, "kl": 0.016051331534981728, "learning_rate": 4.995359241600357e-06, "loss": 0.0, "reward": 3.625, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.875, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 449 }, { "completion_length": 475.75, "epoch": 0.12472283813747229, "grad_norm": 0.3888663649559021, "kl": 0.01018838956952095, "learning_rate": 4.995332540568715e-06, "loss": -0.0, "reward": 3.0, "reward_std": 1.8484227657318115, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 450 }, { "completion_length": 586.5, "epoch": 0.125, "grad_norm": 0.2376839518547058, "kl": 0.012960893101990223, "learning_rate": 4.995305763015604e-06, "loss": 0.0, "reward": 5.625, "reward_std": 0.25, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.625, "step": 451 }, { "completion_length": 535.5, "epoch": 0.12527716186252771, "grad_norm": 0.3031328618526459, "kl": 0.022141581401228905, "learning_rate": 4.995278908941845e-06, "loss": 0.0, "reward": 4.5, "reward_std": 1.8929693698883057, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 452 }, { "completion_length": 556.5, "epoch": 0.12555432372505543, "grad_norm": 0.25992169976234436, "kl": 0.015524713322520256, "learning_rate": 4.995251978348263e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 453 }, { "completion_length": 484.0, "epoch": 0.12583148558758314, "grad_norm": 0.29005125164985657, "kl": 0.01509784534573555, "learning_rate": 4.995224971235683e-06, "loss": -0.0, "reward": 3.5, "reward_std": 2.0615527629852295, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 454 }, { "completion_length": 513.0, "epoch": 0.12610864745011086, "grad_norm": 0.3984679877758026, "kl": 0.0152598200365901, "learning_rate": 4.995197887604932e-06, "loss": 0.0, "reward": 2.625, "reward_std": 1.4361406564712524, "rewards/confident_score_func": 0.375, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 455 }, { "completion_length": 645.25, "epoch": 0.12638580931263857, "grad_norm": 0.24016231298446655, "kl": 0.013699004426598549, "learning_rate": 4.995170727456842e-06, "loss": 0.0, "reward": 5.25, "reward_std": 0.5773502588272095, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 456 }, { "completion_length": 617.75, "epoch": 0.12666297117516628, "grad_norm": 0.2645907700061798, "kl": 0.01106535829603672, "learning_rate": 4.995143490792246e-06, "loss": -0.0, "reward": 3.5, "reward_std": 2.0615527629852295, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 457 }, { "completion_length": 646.75, "epoch": 0.12694013303769403, "grad_norm": 0.26844871044158936, "kl": 0.01189558394253254, "learning_rate": 4.995116177611978e-06, "loss": 0.0, "reward": 5.5, "reward_std": 0.5, "rewards/confident_score_func": 1.75, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 458 }, { "completion_length": 604.0, "epoch": 0.12721729490022174, "grad_norm": 0.24473167955875397, "kl": 0.01646513119339943, "learning_rate": 4.995088787916877e-06, "loss": 0.0, "reward": 2.96875, "reward_std": 1.8662992715835571, "rewards/confident_score_func": 0.375, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 459 }, { "completion_length": 604.0, "epoch": 0.12749445676274945, "grad_norm": 0.26254117488861084, "kl": 0.010784992948174477, "learning_rate": 4.995061321707781e-06, "loss": 0.0, "reward": 2.75, "reward_std": 1.3540064096450806, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 460 }, { "completion_length": 484.75, "epoch": 0.12777161862527717, "grad_norm": 0.30859071016311646, "kl": 0.016539964824914932, "learning_rate": 4.995033778985534e-06, "loss": -0.0, "reward": 3.875, "reward_std": 2.1746647357940674, "rewards/confident_score_func": 1.125, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 461 }, { "completion_length": 582.75, "epoch": 0.12804878048780488, "grad_norm": 0.30148911476135254, "kl": 0.016737353056669235, "learning_rate": 4.99500615975098e-06, "loss": -0.0, "reward": 2.875, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 462 }, { "completion_length": 516.0, "epoch": 0.1283259423503326, "grad_norm": 0.3810352087020874, "kl": 0.01474794466048479, "learning_rate": 4.994978464004967e-06, "loss": -0.0, "reward": 3.375, "reward_std": 1.6007810831069946, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 463 }, { "completion_length": 513.75, "epoch": 0.1286031042128603, "grad_norm": 0.3015996217727661, "kl": 0.017074020579457283, "learning_rate": 4.994950691748342e-06, "loss": -0.0, "reward": 1.59375, "reward_std": 0.3125, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 464 }, { "completion_length": 606.75, "epoch": 0.12888026607538802, "grad_norm": 0.0, "kl": 0.014301312156021595, "learning_rate": 4.994922842981958e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 465 }, { "completion_length": 520.5, "epoch": 0.12915742793791574, "grad_norm": 0.3250337839126587, "kl": 0.012701382860541344, "learning_rate": 4.994894917706671e-06, "loss": -0.0, "reward": 3.5, "reward_std": 2.0615527629852295, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 466 }, { "completion_length": 544.5, "epoch": 0.12943458980044345, "grad_norm": 0.2486903965473175, "kl": 0.01508486270904541, "learning_rate": 4.9948669159233334e-06, "loss": 0.0, "reward": 5.5, "reward_std": 0.5, "rewards/confident_score_func": 1.75, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 467 }, { "completion_length": 588.75, "epoch": 0.12971175166297116, "grad_norm": 0.25133371353149414, "kl": 0.01142136100679636, "learning_rate": 4.994838837632807e-06, "loss": 0.0, "reward": 4.625, "reward_std": 1.6520190238952637, "rewards/confident_score_func": 1.375, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 468 }, { "completion_length": 581.25, "epoch": 0.12998891352549888, "grad_norm": 0.0, "kl": 0.012077090330421925, "learning_rate": 4.994810682835951e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 469 }, { "completion_length": 576.0, "epoch": 0.13026607538802662, "grad_norm": 0.2636439800262451, "kl": 0.012037253938615322, "learning_rate": 4.99478245153363e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 470 }, { "completion_length": 526.75, "epoch": 0.13054323725055433, "grad_norm": 0.2847610116004944, "kl": 0.019691606983542442, "learning_rate": 4.994754143726709e-06, "loss": 0.0, "reward": 3.875, "reward_std": 2.1746647357940674, "rewards/confident_score_func": 1.125, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 471 }, { "completion_length": 583.75, "epoch": 0.13082039911308205, "grad_norm": 0.24681171774864197, "kl": 0.011356879957020283, "learning_rate": 4.9947257594160556e-06, "loss": -0.0, "reward": 2.875, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 472 }, { "completion_length": 510.5, "epoch": 0.13109756097560976, "grad_norm": 0.28339627385139465, "kl": 0.013071756809949875, "learning_rate": 4.994697298602542e-06, "loss": -0.0, "reward": 3.25, "reward_std": 1.7320507764816284, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 473 }, { "completion_length": 598.0, "epoch": 0.13137472283813748, "grad_norm": 0.32244178652763367, "kl": 0.021649716421961784, "learning_rate": 4.9946687612870394e-06, "loss": -0.0, "reward": 2.875, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 474 }, { "completion_length": 632.0, "epoch": 0.1316518847006652, "grad_norm": 0.0, "kl": 0.01582184061408043, "learning_rate": 4.994640147470424e-06, "loss": 0.0, "reward": 4.75, "reward_std": 0.0, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 475 }, { "completion_length": 698.0, "epoch": 0.1319290465631929, "grad_norm": 0.236092209815979, "kl": 0.026537545025348663, "learning_rate": 4.994611457153572e-06, "loss": -0.0, "reward": 3.71875, "reward_std": 2.390029191970825, "rewards/confident_score_func": 1.125, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 476 }, { "completion_length": 575.25, "epoch": 0.13220620842572062, "grad_norm": 0.23448577523231506, "kl": 0.013198057189583778, "learning_rate": 4.994582690337365e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 477 }, { "completion_length": 595.0, "epoch": 0.13248337028824833, "grad_norm": 0.24670638144016266, "kl": 0.01292665395885706, "learning_rate": 4.994553847022683e-06, "loss": 0.0, "reward": 3.625, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.875, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 478 }, { "completion_length": 590.5, "epoch": 0.13276053215077604, "grad_norm": 0.24097318947315216, "kl": 0.011891084723174572, "learning_rate": 4.994524927210412e-06, "loss": -0.0, "reward": 3.375, "reward_std": 1.6007810831069946, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 479 }, { "completion_length": 458.25, "epoch": 0.13303769401330376, "grad_norm": 0.0, "kl": 0.017252717167139053, "learning_rate": 4.994495930901438e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 480 }, { "completion_length": 580.0, "epoch": 0.13331485587583147, "grad_norm": 0.2571314871311188, "kl": 0.013605281710624695, "learning_rate": 4.994466858096652e-06, "loss": 0.0, "reward": 5.5, "reward_std": 0.5, "rewards/confident_score_func": 1.75, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 481 }, { "completion_length": 573.0, "epoch": 0.13359201773835921, "grad_norm": 0.24764439463615417, "kl": 0.014911593869328499, "learning_rate": 4.994437708796943e-06, "loss": 0.0, "reward": 4.0, "reward_std": 1.5, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 482 }, { "completion_length": 582.0, "epoch": 0.13386917960088693, "grad_norm": 0.2913679778575897, "kl": 0.013406328856945038, "learning_rate": 4.9944084830032055e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 483 }, { "completion_length": 607.75, "epoch": 0.13414634146341464, "grad_norm": 0.28370359539985657, "kl": 0.01871001161634922, "learning_rate": 4.994379180716338e-06, "loss": -0.0, "reward": 2.59375, "reward_std": 1.4626424312591553, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 484 }, { "completion_length": 653.0, "epoch": 0.13442350332594236, "grad_norm": 0.2517474293708801, "kl": 0.012899363413453102, "learning_rate": 4.994349801937236e-06, "loss": 0.0, "reward": 5.25, "reward_std": 0.5773502588272095, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 485 }, { "completion_length": 582.0, "epoch": 0.13470066518847007, "grad_norm": 0.2633453607559204, "kl": 0.011680581606924534, "learning_rate": 4.994320346666803e-06, "loss": 0.0, "reward": 5.5, "reward_std": 0.5, "rewards/confident_score_func": 1.75, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 486 }, { "completion_length": 543.0, "epoch": 0.13497782705099778, "grad_norm": 0.2910776138305664, "kl": 0.011736645363271236, "learning_rate": 4.9942908149059395e-06, "loss": 0.0, "reward": 5.5, "reward_std": 0.5, "rewards/confident_score_func": 1.75, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 487 }, { "completion_length": 536.25, "epoch": 0.1352549889135255, "grad_norm": 0.27956756949424744, "kl": 0.011400609277188778, "learning_rate": 4.994261206655554e-06, "loss": 0.0, "reward": 4.25, "reward_std": 1.7320507764816284, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 488 }, { "completion_length": 551.0, "epoch": 0.1355321507760532, "grad_norm": 0.26625487208366394, "kl": 0.01121117826551199, "learning_rate": 4.994231521916553e-06, "loss": -0.0, "reward": 3.25, "reward_std": 1.7320507764816284, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.75, "step": 489 }, { "completion_length": 401.0, "epoch": 0.13580931263858093, "grad_norm": 0.33662980794906616, "kl": 0.024227041751146317, "learning_rate": 4.994201760689847e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 490 }, { "completion_length": 473.25, "epoch": 0.13608647450110864, "grad_norm": 0.3164840638637543, "kl": 0.015716426074504852, "learning_rate": 4.994171922976349e-06, "loss": 0.0, "reward": 5.5, "reward_std": 0.5, "rewards/confident_score_func": 1.75, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 491 }, { "completion_length": 578.0, "epoch": 0.13636363636363635, "grad_norm": 0.24282212555408478, "kl": 0.011174133978784084, "learning_rate": 4.994142008776972e-06, "loss": 0.0, "reward": 2.5, "reward_std": 1.5, "rewards/confident_score_func": 0.25, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 492 }, { "completion_length": 880.0, "epoch": 0.13664079822616407, "grad_norm": 0.19305139780044556, "kl": 0.011374962516129017, "learning_rate": 4.994112018092636e-06, "loss": -0.0, "reward": 2.21875, "reward_std": 2.7240729331970215, "rewards/confident_score_func": 0.375, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.59375, "step": 493 }, { "completion_length": 496.5, "epoch": 0.1369179600886918, "grad_norm": 0.2880307137966156, "kl": 0.011024456471204758, "learning_rate": 4.99408195092426e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 494 }, { "completion_length": 451.5, "epoch": 0.13719512195121952, "grad_norm": 0.30747029185295105, "kl": 0.01718844100832939, "learning_rate": 4.994051807272765e-06, "loss": 0.0, "reward": 4.5, "reward_std": 1.8929693698883057, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 495 }, { "completion_length": 454.0, "epoch": 0.13747228381374724, "grad_norm": 0.3002479076385498, "kl": 0.012612885795533657, "learning_rate": 4.9940215871390765e-06, "loss": -0.0, "reward": 3.625, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.875, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 496 }, { "completion_length": 600.0, "epoch": 0.13774944567627495, "grad_norm": 0.2661634385585785, "kl": 0.012211484834551811, "learning_rate": 4.9939912905241215e-06, "loss": -0.0, "reward": 3.875, "reward_std": 2.1746647357940674, "rewards/confident_score_func": 1.125, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 497 }, { "completion_length": 524.75, "epoch": 0.13802660753880266, "grad_norm": 0.0, "kl": 0.011420011520385742, "learning_rate": 4.993960917428827e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 498 }, { "completion_length": 601.75, "epoch": 0.13830376940133038, "grad_norm": 0.22552883625030518, "kl": 0.013109054416418076, "learning_rate": 4.993930467854127e-06, "loss": -0.0, "reward": 3.5, "reward_std": 2.0615527629852295, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 499 }, { "completion_length": 602.5, "epoch": 0.1385809312638581, "grad_norm": 0.2338850349187851, "kl": 0.015728723257780075, "learning_rate": 4.993899941800953e-06, "loss": 0.0, "reward": 4.625, "reward_std": 1.6520190238952637, "rewards/confident_score_func": 1.375, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 500 }, { "completion_length": 556.25, "epoch": 0.1388580931263858, "grad_norm": 0.29303935170173645, "kl": 0.010847372002899647, "learning_rate": 4.993869339270242e-06, "loss": 0.0, "reward": 3.75, "reward_std": 1.7795131206512451, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 501 }, { "completion_length": 435.25, "epoch": 0.13913525498891352, "grad_norm": 0.4005419909954071, "kl": 0.03049500845372677, "learning_rate": 4.993838660262934e-06, "loss": 0.0, "reward": 5.5, "reward_std": 0.5, "rewards/confident_score_func": 1.75, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 502 }, { "completion_length": 492.75, "epoch": 0.13941241685144123, "grad_norm": 0.2644980549812317, "kl": 0.016365667805075645, "learning_rate": 4.993807904779967e-06, "loss": -0.0, "reward": 2.875, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 503 }, { "completion_length": 589.0, "epoch": 0.13968957871396895, "grad_norm": 0.29014119505882263, "kl": 0.01156685221940279, "learning_rate": 4.993777072822286e-06, "loss": 0.0, "reward": 4.875, "reward_std": 1.75, "rewards/confident_score_func": 1.625, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 504 }, { "completion_length": 556.25, "epoch": 0.13996674057649666, "grad_norm": 0.30305659770965576, "kl": 0.012550624087452888, "learning_rate": 4.993746164390836e-06, "loss": 0.0, "reward": 3.5, "reward_std": 1.443375587463379, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 505 }, { "completion_length": 540.0, "epoch": 0.1402439024390244, "grad_norm": 0.26109403371810913, "kl": 0.01459554210305214, "learning_rate": 4.993715179486565e-06, "loss": -0.0, "reward": 2.625, "reward_std": 1.4361406564712524, "rewards/confident_score_func": 0.375, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 506 }, { "completion_length": 492.25, "epoch": 0.14052106430155212, "grad_norm": 0.28578728437423706, "kl": 0.016000347211956978, "learning_rate": 4.993684118110424e-06, "loss": -0.0, "reward": 5.5, "reward_std": 0.5, "rewards/confident_score_func": 1.75, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 507 }, { "completion_length": 557.5, "epoch": 0.14079822616407983, "grad_norm": 0.5610048770904541, "kl": 0.03864310681819916, "learning_rate": 4.993652980263362e-06, "loss": 0.0, "reward": 4.71875, "reward_std": 1.6719967126846313, "rewards/confident_score_func": 1.625, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 508 }, { "completion_length": 447.25, "epoch": 0.14107538802660755, "grad_norm": 0.3150566816329956, "kl": 0.011573800817131996, "learning_rate": 4.993621765946339e-06, "loss": 0.0, "reward": 4.5, "reward_std": 1.8929693698883057, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 509 }, { "completion_length": 557.0, "epoch": 0.14135254988913526, "grad_norm": 0.28059470653533936, "kl": 0.013491615653038025, "learning_rate": 4.993590475160308e-06, "loss": -0.0, "reward": 2.875, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 510 }, { "completion_length": 510.75, "epoch": 0.14162971175166297, "grad_norm": 0.2620765268802643, "kl": 0.015655498951673508, "learning_rate": 4.993559107906232e-06, "loss": -0.0, "reward": 2.875, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 511 }, { "completion_length": 523.25, "epoch": 0.1419068736141907, "grad_norm": 0.3310132622718811, "kl": 0.013490190729498863, "learning_rate": 4.993527664185069e-06, "loss": -0.0, "reward": 5.5, "reward_std": 0.5, "rewards/confident_score_func": 1.75, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 512 }, { "completion_length": 451.5, "epoch": 0.1421840354767184, "grad_norm": 0.3757499158382416, "kl": 0.01459009014070034, "learning_rate": 4.993496143997787e-06, "loss": -0.0, "reward": 4.25, "reward_std": 1.7320507764816284, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 513 }, { "completion_length": 493.75, "epoch": 0.14246119733924612, "grad_norm": 0.26614710688591003, "kl": 0.012076952494680882, "learning_rate": 4.9934645473453505e-06, "loss": -0.0, "reward": 5.5, "reward_std": 0.5, "rewards/confident_score_func": 1.75, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 514 }, { "completion_length": 525.5, "epoch": 0.14273835920177383, "grad_norm": 0.2564806640148163, "kl": 0.009123008698225021, "learning_rate": 4.9934328742287285e-06, "loss": 0.0, "reward": 4.625, "reward_std": 1.6520190238952637, "rewards/confident_score_func": 1.375, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 515 }, { "completion_length": 557.25, "epoch": 0.14301552106430154, "grad_norm": 0.23396946489810944, "kl": 0.009820730425417423, "learning_rate": 4.993401124648892e-06, "loss": -0.0, "reward": 3.125, "reward_std": 1.75, "rewards/confident_score_func": 0.875, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 516 }, { "completion_length": 494.5, "epoch": 0.14329268292682926, "grad_norm": 0.32461264729499817, "kl": 0.011900431476533413, "learning_rate": 4.993369298606817e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 517 }, { "completion_length": 532.25, "epoch": 0.14356984478935697, "grad_norm": 0.3295058012008667, "kl": 0.013253359124064445, "learning_rate": 4.993337396103477e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 518 }, { "completion_length": 553.0, "epoch": 0.1438470066518847, "grad_norm": 0.29310572147369385, "kl": 0.014249798841774464, "learning_rate": 4.993305417139851e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 519 }, { "completion_length": 466.25, "epoch": 0.14412416851441243, "grad_norm": 0.28870734572410583, "kl": 0.012930367141962051, "learning_rate": 4.99327336171692e-06, "loss": -0.0, "reward": 4.375, "reward_std": 1.4930394887924194, "rewards/confident_score_func": 1.125, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 520 }, { "completion_length": 617.5, "epoch": 0.14440133037694014, "grad_norm": 0.27473852038383484, "kl": 0.017847536131739616, "learning_rate": 4.993241229835666e-06, "loss": 0.0, "reward": 1.6875, "reward_std": 0.5153881907463074, "rewards/confident_score_func": 0.125, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.6875, "step": 521 }, { "completion_length": 492.5, "epoch": 0.14467849223946785, "grad_norm": 0.0, "kl": 0.014758508652448654, "learning_rate": 4.993209021497075e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 522 }, { "completion_length": 467.0, "epoch": 0.14495565410199557, "grad_norm": 0.37473565340042114, "kl": 0.016591371968388557, "learning_rate": 4.993176736702136e-06, "loss": 0.0, "reward": 4.375, "reward_std": 1.4930394887924194, "rewards/confident_score_func": 1.125, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 523 }, { "completion_length": 568.25, "epoch": 0.14523281596452328, "grad_norm": 0.2429487705230713, "kl": 0.010972530581057072, "learning_rate": 4.993144375451837e-06, "loss": -0.0, "reward": 3.78125, "reward_std": 2.0725762844085693, "rewards/confident_score_func": 1.125, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.65625, "step": 524 }, { "completion_length": 964.0, "epoch": 0.145509977827051, "grad_norm": 0.2238660603761673, "kl": 0.010286291129887104, "learning_rate": 4.993111937747171e-06, "loss": -0.0, "reward": 2.09375, "reward_std": 2.7336158752441406, "rewards/confident_score_func": 0.25, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.59375, "step": 525 }, { "completion_length": 550.75, "epoch": 0.1457871396895787, "grad_norm": 0.23857972025871277, "kl": 0.010239925235509872, "learning_rate": 4.993079423589134e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 526 }, { "completion_length": 473.75, "epoch": 0.14606430155210642, "grad_norm": 0.35116782784461975, "kl": 0.013546721078455448, "learning_rate": 4.993046832978722e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 527 }, { "completion_length": 554.0, "epoch": 0.14634146341463414, "grad_norm": 0.2581903636455536, "kl": 0.008545651100575924, "learning_rate": 4.993014165916934e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 528 }, { "completion_length": 583.0, "epoch": 0.14661862527716185, "grad_norm": 0.262373685836792, "kl": 0.01125812716782093, "learning_rate": 4.992981422404772e-06, "loss": -0.0, "reward": 3.625, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.875, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 529 }, { "completion_length": 616.75, "epoch": 0.14689578713968957, "grad_norm": 0.2554208040237427, "kl": 0.012226797640323639, "learning_rate": 4.9929486024432405e-06, "loss": 0.0, "reward": 4.5, "reward_std": 1.8929693698883057, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 530 }, { "completion_length": 454.25, "epoch": 0.1471729490022173, "grad_norm": 0.3536977469921112, "kl": 0.01164777297526598, "learning_rate": 4.992915706033345e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 531 }, { "completion_length": 604.75, "epoch": 0.14745011086474502, "grad_norm": 0.244077667593956, "kl": 0.00902183260768652, "learning_rate": 4.9928827331760965e-06, "loss": -0.0, "reward": 3.375, "reward_std": 1.6007810831069946, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 532 }, { "completion_length": 617.25, "epoch": 0.14772727272727273, "grad_norm": 0.255219429731369, "kl": 0.008280010893940926, "learning_rate": 4.992849683872504e-06, "loss": -0.0, "reward": 4.375, "reward_std": 1.4930394887924194, "rewards/confident_score_func": 1.125, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 533 }, { "completion_length": 560.75, "epoch": 0.14800443458980045, "grad_norm": 0.2375599592924118, "kl": 0.008287557400763035, "learning_rate": 4.992816558123581e-06, "loss": -0.0, "reward": 4.4375, "reward_std": 1.6754974126815796, "rewards/confident_score_func": 1.375, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.6875, "step": 534 }, { "completion_length": 517.5, "epoch": 0.14828159645232816, "grad_norm": 0.26907461881637573, "kl": 0.015090258792042732, "learning_rate": 4.992783355930345e-06, "loss": -0.0, "reward": 3.5, "reward_std": 2.0615527629852295, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 535 }, { "completion_length": 454.75, "epoch": 0.14855875831485588, "grad_norm": 0.29815760254859924, "kl": 0.011481473222374916, "learning_rate": 4.992750077293813e-06, "loss": 0.0, "reward": 5.5, "reward_std": 0.5, "rewards/confident_score_func": 1.75, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 536 }, { "completion_length": 514.0, "epoch": 0.1488359201773836, "grad_norm": 0.2734052538871765, "kl": 0.010894633829593658, "learning_rate": 4.992716722215006e-06, "loss": 0.0, "reward": 2.625, "reward_std": 1.4361406564712524, "rewards/confident_score_func": 0.375, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 537 }, { "completion_length": 452.0, "epoch": 0.1491130820399113, "grad_norm": 0.32846489548683167, "kl": 0.012017915956676006, "learning_rate": 4.992683290694946e-06, "loss": 0.0, "reward": 3.65625, "reward_std": 2.4224965572357178, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.65625, "step": 538 }, { "completion_length": 463.75, "epoch": 0.14939024390243902, "grad_norm": 0.3012801706790924, "kl": 0.011615828610956669, "learning_rate": 4.992649782734659e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 539 }, { "completion_length": 530.5, "epoch": 0.14966740576496673, "grad_norm": 0.2570907771587372, "kl": 0.020741106942296028, "learning_rate": 4.992616198335173e-06, "loss": -0.0, "reward": 4.875, "reward_std": 1.75, "rewards/confident_score_func": 1.625, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 540 }, { "completion_length": 530.75, "epoch": 0.14994456762749445, "grad_norm": 0.2735753059387207, "kl": 0.010087276808917522, "learning_rate": 4.992582537497516e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 541 }, { "completion_length": 597.5, "epoch": 0.15022172949002216, "grad_norm": 0.2527350187301636, "kl": 0.008352112025022507, "learning_rate": 4.9925488002227214e-06, "loss": 0.0, "reward": 2.75, "reward_std": 1.0, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.625, "step": 542 }, { "completion_length": 544.5, "epoch": 0.1504988913525499, "grad_norm": 0.28268077969551086, "kl": 0.010395168326795101, "learning_rate": 4.992514986511825e-06, "loss": -0.0, "reward": 2.875, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 543 }, { "completion_length": 496.0, "epoch": 0.15077605321507762, "grad_norm": 0.2935279905796051, "kl": 0.011516474187374115, "learning_rate": 4.992481096365862e-06, "loss": -0.0, "reward": 3.90625, "reward_std": 2.13447642326355, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.65625, "step": 544 }, { "completion_length": 514.0, "epoch": 0.15105321507760533, "grad_norm": 0.0, "kl": 0.00806362647563219, "learning_rate": 4.992447129785872e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 545 }, { "completion_length": 559.5, "epoch": 0.15133037694013304, "grad_norm": 0.30391356348991394, "kl": 0.011520025320351124, "learning_rate": 4.992413086772897e-06, "loss": 0.0, "reward": 4.375, "reward_std": 1.4930394887924194, "rewards/confident_score_func": 1.125, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 546 }, { "completion_length": 526.25, "epoch": 0.15160753880266076, "grad_norm": 0.0, "kl": 0.010381520725786686, "learning_rate": 4.992378967327981e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 547 }, { "completion_length": 659.75, "epoch": 0.15188470066518847, "grad_norm": 0.18137741088867188, "kl": 0.007755795959383249, "learning_rate": 4.99234477145217e-06, "loss": 0.0, "reward": 4.5, "reward_std": 1.8929693698883057, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 548 }, { "completion_length": 487.75, "epoch": 0.15216186252771619, "grad_norm": 0.26786279678344727, "kl": 0.010686397552490234, "learning_rate": 4.9923104991465135e-06, "loss": 0.0, "reward": 4.5, "reward_std": 1.8929693698883057, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 549 }, { "completion_length": 466.25, "epoch": 0.1524390243902439, "grad_norm": 0.33955177664756775, "kl": 0.011687343008816242, "learning_rate": 4.99227615041206e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 550 }, { "completion_length": 460.0, "epoch": 0.1527161862527716, "grad_norm": 0.2585194706916809, "kl": 0.011075504124164581, "learning_rate": 4.992241725249866e-06, "loss": 0.0, "reward": 4.875, "reward_std": 1.75, "rewards/confident_score_func": 1.625, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 551 }, { "completion_length": 472.5, "epoch": 0.15299334811529933, "grad_norm": 0.34748926758766174, "kl": 0.011733915656805038, "learning_rate": 4.992207223660985e-06, "loss": 0.0, "reward": 4.625, "reward_std": 1.6520190238952637, "rewards/confident_score_func": 1.375, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 552 }, { "completion_length": 488.0, "epoch": 0.15327050997782704, "grad_norm": 0.3018478751182556, "kl": 0.012888339348137379, "learning_rate": 4.992172645646477e-06, "loss": 0.0, "reward": 3.25, "reward_std": 1.7320507764816284, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 553 }, { "completion_length": 486.5, "epoch": 0.15354767184035475, "grad_norm": 0.32259300351142883, "kl": 0.008902732282876968, "learning_rate": 4.9921379912074e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 554 }, { "completion_length": 532.75, "epoch": 0.1538248337028825, "grad_norm": 0.3468951880931854, "kl": 0.023253489285707474, "learning_rate": 4.992103260344818e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 555 }, { "completion_length": 528.0, "epoch": 0.1541019955654102, "grad_norm": 0.31509578227996826, "kl": 0.008939484134316444, "learning_rate": 4.992068453059796e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 556 }, { "completion_length": 522.25, "epoch": 0.15437915742793792, "grad_norm": 0.26434117555618286, "kl": 0.011230683885514736, "learning_rate": 4.9920335693534016e-06, "loss": 0.0, "reward": 3.75, "reward_std": 1.7795131206512451, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 557 }, { "completion_length": 467.25, "epoch": 0.15465631929046564, "grad_norm": 0.3097941279411316, "kl": 0.013999595306813717, "learning_rate": 4.9919986092267034e-06, "loss": -0.0, "reward": 1.625, "reward_std": 0.25, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.75, "step": 558 }, { "completion_length": 521.0, "epoch": 0.15493348115299335, "grad_norm": 0.3490343391895294, "kl": 0.009827574715018272, "learning_rate": 4.991963572680775e-06, "loss": 0.0, "reward": 3.75, "reward_std": 1.7795131206512451, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 559 }, { "completion_length": 544.0, "epoch": 0.15521064301552107, "grad_norm": 0.25716906785964966, "kl": 0.01039281114935875, "learning_rate": 4.99192845971669e-06, "loss": -0.0, "reward": 3.5, "reward_std": 2.0615527629852295, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 560 }, { "completion_length": 656.0, "epoch": 0.15548780487804878, "grad_norm": 0.28156575560569763, "kl": 0.0088484613224864, "learning_rate": 4.991893270335526e-06, "loss": -0.0, "reward": 3.0, "reward_std": 1.5, "rewards/confident_score_func": 0.25, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 561 }, { "completion_length": 553.25, "epoch": 0.1557649667405765, "grad_norm": 0.24885910749435425, "kl": 0.010102292522788048, "learning_rate": 4.99185800453836e-06, "loss": 0.0, "reward": 3.0, "reward_std": 1.8484227657318115, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 562 }, { "completion_length": 592.75, "epoch": 0.1560421286031042, "grad_norm": 0.23886214196681976, "kl": 0.008565710857510567, "learning_rate": 4.991822662326276e-06, "loss": 0.0, "reward": 4.5, "reward_std": 1.8929693698883057, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 563 }, { "completion_length": 493.0, "epoch": 0.15631929046563192, "grad_norm": 0.28623223304748535, "kl": 0.011345049366354942, "learning_rate": 4.9917872437003554e-06, "loss": 0.0, "reward": 5.5, "reward_std": 0.5, "rewards/confident_score_func": 1.75, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 564 }, { "completion_length": 524.5, "epoch": 0.15659645232815964, "grad_norm": 0.32436177134513855, "kl": 0.009614119306206703, "learning_rate": 4.991751748661687e-06, "loss": 0.0, "reward": 4.25, "reward_std": 1.7320507764816284, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 565 }, { "completion_length": 507.0, "epoch": 0.15687361419068735, "grad_norm": 0.257015585899353, "kl": 0.011903928592801094, "learning_rate": 4.991716177211357e-06, "loss": -0.0, "reward": 5.0, "reward_std": 0.5, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 566 }, { "completion_length": 518.5, "epoch": 0.1571507760532151, "grad_norm": 0.26851359009742737, "kl": 0.021136531606316566, "learning_rate": 4.991680529350458e-06, "loss": 0.0, "reward": 2.625, "reward_std": 1.4361406564712524, "rewards/confident_score_func": 0.375, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 567 }, { "completion_length": 548.75, "epoch": 0.1574279379157428, "grad_norm": 0.28408876061439514, "kl": 0.011704757809638977, "learning_rate": 4.9916448050800815e-06, "loss": -0.0, "reward": 3.875, "reward_std": 2.1746647357940674, "rewards/confident_score_func": 1.125, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 568 }, { "completion_length": 567.75, "epoch": 0.15770509977827052, "grad_norm": 0.30359533429145813, "kl": 0.018905548378825188, "learning_rate": 4.991609004401324e-06, "loss": -0.0, "reward": 5.5, "reward_std": 0.5, "rewards/confident_score_func": 1.75, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 569 }, { "completion_length": 575.25, "epoch": 0.15798226164079823, "grad_norm": 0.297054648399353, "kl": 0.009861034341156483, "learning_rate": 4.991573127315284e-06, "loss": 0.0, "reward": 3.75, "reward_std": 1.7795131206512451, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 570 }, { "completion_length": 540.75, "epoch": 0.15825942350332595, "grad_norm": 0.27634549140930176, "kl": 0.01018818560987711, "learning_rate": 4.99153717382306e-06, "loss": 0.0, "reward": 2.875, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 571 }, { "completion_length": 611.5, "epoch": 0.15853658536585366, "grad_norm": 0.30012089014053345, "kl": 0.011986632831394672, "learning_rate": 4.991501143925755e-06, "loss": -0.0, "reward": 4.0, "reward_std": 2.020725965499878, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 572 }, { "completion_length": 499.75, "epoch": 0.15881374722838137, "grad_norm": 0.2884584665298462, "kl": 0.012602191418409348, "learning_rate": 4.991465037624475e-06, "loss": -0.0, "reward": 2.875, "reward_std": 1.973786473274231, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.625, "step": 573 }, { "completion_length": 521.75, "epoch": 0.1590909090909091, "grad_norm": 0.307699054479599, "kl": 0.009974922984838486, "learning_rate": 4.991428854920327e-06, "loss": -0.0, "reward": 3.25, "reward_std": 1.7320507764816284, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 574 }, { "completion_length": 485.75, "epoch": 0.1593680709534368, "grad_norm": 0.29446256160736084, "kl": 0.013076583854854107, "learning_rate": 4.991392595814419e-06, "loss": 0.0, "reward": 4.875, "reward_std": 1.75, "rewards/confident_score_func": 1.625, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 575 }, { "completion_length": 480.0, "epoch": 0.15964523281596452, "grad_norm": 0.0, "kl": 0.016554081812500954, "learning_rate": 4.991356260307865e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 576 }, { "completion_length": 473.5, "epoch": 0.15992239467849223, "grad_norm": 0.28367477655410767, "kl": 0.00996217131614685, "learning_rate": 4.991319848401777e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 577 }, { "completion_length": 435.0, "epoch": 0.16019955654101994, "grad_norm": 0.3161223232746124, "kl": 0.012582499533891678, "learning_rate": 4.991283360097275e-06, "loss": -0.0, "reward": 3.25, "reward_std": 1.7795131206512451, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 578 }, { "completion_length": 436.0, "epoch": 0.16047671840354769, "grad_norm": 0.30917447805404663, "kl": 0.013670515269041061, "learning_rate": 4.991246795395473e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 579 }, { "completion_length": 473.5, "epoch": 0.1607538802660754, "grad_norm": 0.0, "kl": 0.013224879279732704, "learning_rate": 4.991210154297497e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 580 }, { "completion_length": 562.25, "epoch": 0.1610310421286031, "grad_norm": 0.3973218500614166, "kl": 0.02502482570707798, "learning_rate": 4.991173436804468e-06, "loss": 0.0, "reward": 4.875, "reward_std": 1.75, "rewards/confident_score_func": 1.625, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 581 }, { "completion_length": 530.0, "epoch": 0.16130820399113083, "grad_norm": 0.2802339792251587, "kl": 0.021899983286857605, "learning_rate": 4.991136642917514e-06, "loss": -0.0, "reward": 1.59375, "reward_std": 0.3125, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 582 }, { "completion_length": 570.75, "epoch": 0.16158536585365854, "grad_norm": 0.251929372549057, "kl": 0.013681888580322266, "learning_rate": 4.991099772637761e-06, "loss": -0.0, "reward": 3.5, "reward_std": 2.0615527629852295, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 583 }, { "completion_length": 660.0, "epoch": 0.16186252771618626, "grad_norm": 0.0, "kl": 0.010052117519080639, "learning_rate": 4.991062825966341e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 584 }, { "completion_length": 466.75, "epoch": 0.16213968957871397, "grad_norm": 0.266069620847702, "kl": 0.013937646523118019, "learning_rate": 4.991025802904386e-06, "loss": 0.0, "reward": 4.875, "reward_std": 1.75, "rewards/confident_score_func": 1.625, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 585 }, { "completion_length": 588.75, "epoch": 0.16241685144124168, "grad_norm": 0.258294016122818, "kl": 0.02581559307873249, "learning_rate": 4.9909887034530325e-06, "loss": -0.0, "reward": 2.875, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 586 }, { "completion_length": 603.5, "epoch": 0.1626940133037694, "grad_norm": 0.2175741195678711, "kl": 0.01316547580063343, "learning_rate": 4.990951527613418e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 587 }, { "completion_length": 495.75, "epoch": 0.1629711751662971, "grad_norm": 0.2710520625114441, "kl": 0.013409608043730259, "learning_rate": 4.9909142753866826e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 588 }, { "completion_length": 540.75, "epoch": 0.16324833702882482, "grad_norm": 0.2405608594417572, "kl": 0.014538454823195934, "learning_rate": 4.990876946773967e-06, "loss": -0.0, "reward": 1.875, "reward_std": 0.25, "rewards/confident_score_func": 0.125, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 589 }, { "completion_length": 495.25, "epoch": 0.16352549889135254, "grad_norm": 0.0, "kl": 0.013090737164020538, "learning_rate": 4.990839541776417e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 590 }, { "completion_length": 500.25, "epoch": 0.16380266075388025, "grad_norm": 0.36328843235969543, "kl": 0.011497433297336102, "learning_rate": 4.990802060395182e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 591 }, { "completion_length": 571.0, "epoch": 0.164079822616408, "grad_norm": 0.27580806612968445, "kl": 0.012459410354495049, "learning_rate": 4.990764502631408e-06, "loss": 0.0, "reward": 4.25, "reward_std": 1.7320507764816284, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 592 }, { "completion_length": 497.25, "epoch": 0.1643569844789357, "grad_norm": 0.0, "kl": 0.013315794989466667, "learning_rate": 4.9907268684862486e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 593 }, { "completion_length": 573.75, "epoch": 0.16463414634146342, "grad_norm": 0.27350330352783203, "kl": 0.011269363574683666, "learning_rate": 4.990689157960856e-06, "loss": -0.0, "reward": 3.875, "reward_std": 2.1746647357940674, "rewards/confident_score_func": 1.125, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 594 }, { "completion_length": 531.5, "epoch": 0.16491130820399114, "grad_norm": 0.0, "kl": 0.01219314057379961, "learning_rate": 4.99065137105639e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 595 }, { "completion_length": 550.25, "epoch": 0.16518847006651885, "grad_norm": 0.26318472623825073, "kl": 0.018382603302598, "learning_rate": 4.990613507774006e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 596 }, { "completion_length": 524.5, "epoch": 0.16546563192904656, "grad_norm": 0.296364426612854, "kl": 0.017207426950335503, "learning_rate": 4.990575568114866e-06, "loss": -0.0, "reward": 3.875, "reward_std": 2.1746647357940674, "rewards/confident_score_func": 1.125, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 597 }, { "completion_length": 503.0, "epoch": 0.16574279379157428, "grad_norm": 0.3374629020690918, "kl": 0.03152107447385788, "learning_rate": 4.990537552080135e-06, "loss": 0.0, "reward": 3.71875, "reward_std": 1.8153712749481201, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 598 }, { "completion_length": 619.75, "epoch": 0.166019955654102, "grad_norm": 0.2439812272787094, "kl": 0.012068821117281914, "learning_rate": 4.990499459670977e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 599 }, { "completion_length": 861.25, "epoch": 0.1662971175166297, "grad_norm": 0.2447240650653839, "kl": 0.010713434778153896, "learning_rate": 4.99046129088856e-06, "loss": 0.0, "reward": 3.84375, "reward_std": 3.180957317352295, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.59375, "step": 600 }, { "completion_length": 454.0, "epoch": 0.16657427937915742, "grad_norm": 0.31299465894699097, "kl": 0.02412821166217327, "learning_rate": 4.990423045734057e-06, "loss": 0.0, "reward": 4.875, "reward_std": 1.75, "rewards/confident_score_func": 1.625, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 601 }, { "completion_length": 493.5, "epoch": 0.16685144124168513, "grad_norm": 0.28118208050727844, "kl": 0.019647110253572464, "learning_rate": 4.9903847242086375e-06, "loss": 0.0, "reward": 2.5, "reward_std": 1.5, "rewards/confident_score_func": 0.25, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 602 }, { "completion_length": 473.75, "epoch": 0.16712860310421285, "grad_norm": 0.3301386833190918, "kl": 0.017233913764357567, "learning_rate": 4.9903463263134784e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 603 }, { "completion_length": 538.75, "epoch": 0.1674057649667406, "grad_norm": 0.25734275579452515, "kl": 0.011503493413329124, "learning_rate": 4.9903078520497585e-06, "loss": 0.0, "reward": 2.875, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 604 }, { "completion_length": 462.5, "epoch": 0.1676829268292683, "grad_norm": 0.286688894033432, "kl": 0.014761514961719513, "learning_rate": 4.990269301418655e-06, "loss": -0.0, "reward": 2.875, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 605 }, { "completion_length": 466.0, "epoch": 0.16796008869179602, "grad_norm": 0.29520344734191895, "kl": 0.01951587200164795, "learning_rate": 4.990230674421352e-06, "loss": -0.0, "reward": 3.5, "reward_std": 2.0615527629852295, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 606 }, { "completion_length": 497.0, "epoch": 0.16823725055432373, "grad_norm": 0.3253265619277954, "kl": 0.012495821341872215, "learning_rate": 4.990191971059033e-06, "loss": 0.0, "reward": 4.375, "reward_std": 2.136000871658325, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.625, "step": 607 }, { "completion_length": 473.0, "epoch": 0.16851441241685144, "grad_norm": 0.30183103680610657, "kl": 0.014760475605726242, "learning_rate": 4.990153191332885e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 608 }, { "completion_length": 504.5, "epoch": 0.16879157427937916, "grad_norm": 0.27625229954719543, "kl": 0.015464521944522858, "learning_rate": 4.990114335244097e-06, "loss": 0.0, "reward": 4.0, "reward_std": 2.0615527629852295, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 609 }, { "completion_length": 509.0, "epoch": 0.16906873614190687, "grad_norm": 0.24063682556152344, "kl": 0.015493592247366905, "learning_rate": 4.9900754027938615e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 610 }, { "completion_length": 484.0, "epoch": 0.1693458980044346, "grad_norm": 0.27927127480506897, "kl": 0.016056960448622704, "learning_rate": 4.990036393983372e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 611 }, { "completion_length": 497.5, "epoch": 0.1696230598669623, "grad_norm": 0.3216705024242401, "kl": 0.01610710285604, "learning_rate": 4.989997308813825e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 612 }, { "completion_length": 471.75, "epoch": 0.16990022172949001, "grad_norm": 0.0, "kl": 0.02982732281088829, "learning_rate": 4.989958147286418e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 613 }, { "completion_length": 461.75, "epoch": 0.17017738359201773, "grad_norm": 0.35726726055145264, "kl": 0.01583614945411682, "learning_rate": 4.989918909402353e-06, "loss": 0.0, "reward": 2.875, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 614 }, { "completion_length": 586.25, "epoch": 0.17045454545454544, "grad_norm": 0.2799842059612274, "kl": 0.015740476548671722, "learning_rate": 4.989879595162832e-06, "loss": 0.0, "reward": 3.25, "reward_std": 1.7320507764816284, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 615 }, { "completion_length": 491.25, "epoch": 0.17073170731707318, "grad_norm": 0.2671460807323456, "kl": 0.016218988224864006, "learning_rate": 4.989840204569062e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 616 }, { "completion_length": 570.0, "epoch": 0.1710088691796009, "grad_norm": 0.290321409702301, "kl": 0.012621879577636719, "learning_rate": 4.98980073762225e-06, "loss": -0.0, "reward": 4.375, "reward_std": 1.4930394887924194, "rewards/confident_score_func": 1.125, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 617 }, { "completion_length": 491.75, "epoch": 0.1712860310421286, "grad_norm": 0.30281129479408264, "kl": 0.01589544117450714, "learning_rate": 4.989761194323607e-06, "loss": -0.0, "reward": 3.875, "reward_std": 2.1746647357940674, "rewards/confident_score_func": 1.125, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 618 }, { "completion_length": 546.0, "epoch": 0.17156319290465633, "grad_norm": 0.28183916211128235, "kl": 0.016300290822982788, "learning_rate": 4.989721574674345e-06, "loss": -0.0, "reward": 2.875, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 619 }, { "completion_length": 476.5, "epoch": 0.17184035476718404, "grad_norm": 0.29644775390625, "kl": 0.012710604816675186, "learning_rate": 4.98968187867568e-06, "loss": -0.0, "reward": 5.5, "reward_std": 0.5, "rewards/confident_score_func": 1.75, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 620 }, { "completion_length": 549.25, "epoch": 0.17211751662971175, "grad_norm": 0.2744942009449005, "kl": 0.01352127268910408, "learning_rate": 4.989642106328829e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 621 }, { "completion_length": 482.0, "epoch": 0.17239467849223947, "grad_norm": 0.0, "kl": 0.019648486748337746, "learning_rate": 4.98960225763501e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 622 }, { "completion_length": 502.0, "epoch": 0.17267184035476718, "grad_norm": 0.30541110038757324, "kl": 0.014126002788543701, "learning_rate": 4.989562332595447e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 623 }, { "completion_length": 376.25, "epoch": 0.1729490022172949, "grad_norm": 0.36670830845832825, "kl": 0.022599278017878532, "learning_rate": 4.9895223312113636e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 624 }, { "completion_length": 515.0, "epoch": 0.1732261640798226, "grad_norm": 0.2826448082923889, "kl": 0.016482625156641006, "learning_rate": 4.989482253483986e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 625 }, { "completion_length": 501.5, "epoch": 0.17350332594235032, "grad_norm": 0.2871754765510559, "kl": 0.014459342695772648, "learning_rate": 4.989442099414543e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 626 }, { "completion_length": 484.25, "epoch": 0.17378048780487804, "grad_norm": 0.25067001581192017, "kl": 0.015047120861709118, "learning_rate": 4.989401869004268e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 627 }, { "completion_length": 520.0, "epoch": 0.17405764966740578, "grad_norm": 0.3301384150981903, "kl": 0.014846688136458397, "learning_rate": 4.989361562254394e-06, "loss": -0.0, "reward": 3.5, "reward_std": 2.0615527629852295, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 628 }, { "completion_length": 613.5, "epoch": 0.1743348115299335, "grad_norm": 0.2663516104221344, "kl": 0.013788449577987194, "learning_rate": 4.9893211791661545e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 629 }, { "completion_length": 521.75, "epoch": 0.1746119733924612, "grad_norm": 0.30407488346099854, "kl": 0.01589541882276535, "learning_rate": 4.98928071974079e-06, "loss": 0.0, "reward": 3.625, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.875, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 630 }, { "completion_length": 513.5, "epoch": 0.17488913525498892, "grad_norm": 0.3004229962825775, "kl": 0.01469044666737318, "learning_rate": 4.989240183979542e-06, "loss": -0.0, "reward": 2.5, "reward_std": 1.5, "rewards/confident_score_func": 0.25, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 631 }, { "completion_length": 460.0, "epoch": 0.17516629711751663, "grad_norm": 0.31210798025131226, "kl": 0.018655266612768173, "learning_rate": 4.989199571883652e-06, "loss": -0.0, "reward": 3.40625, "reward_std": 1.929418683052063, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.65625, "step": 632 }, { "completion_length": 516.75, "epoch": 0.17544345898004435, "grad_norm": 0.25181135535240173, "kl": 0.016144586727023125, "learning_rate": 4.989158883454366e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 633 }, { "completion_length": 468.5, "epoch": 0.17572062084257206, "grad_norm": 0.3527241349220276, "kl": 0.01797572150826454, "learning_rate": 4.9891181186929315e-06, "loss": 0.0, "reward": 4.25, "reward_std": 1.9148542881011963, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 634 }, { "completion_length": 472.5, "epoch": 0.17599778270509978, "grad_norm": 0.3868396580219269, "kl": 0.01888992451131344, "learning_rate": 4.989077277600599e-06, "loss": 0.0, "reward": 5.25, "reward_std": 0.5773502588272095, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 635 }, { "completion_length": 504.0, "epoch": 0.1762749445676275, "grad_norm": 0.32479724287986755, "kl": 0.01398751325905323, "learning_rate": 4.98903636017862e-06, "loss": 0.0, "reward": 3.625, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.875, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 636 }, { "completion_length": 559.0, "epoch": 0.1765521064301552, "grad_norm": 0.44162577390670776, "kl": 0.014655462466180325, "learning_rate": 4.988995366428251e-06, "loss": 0.0, "reward": 2.875, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 637 }, { "completion_length": 625.5, "epoch": 0.17682926829268292, "grad_norm": 0.2741512954235077, "kl": 0.01312557514756918, "learning_rate": 4.988954296350747e-06, "loss": 0.0, "reward": 2.0, "reward_std": 0.28867512941360474, "rewards/confident_score_func": 0.25, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 638 }, { "completion_length": 501.75, "epoch": 0.17710643015521063, "grad_norm": 0.26595285534858704, "kl": 0.023311911150813103, "learning_rate": 4.98891314994737e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 639 }, { "completion_length": 432.5, "epoch": 0.17738359201773837, "grad_norm": 0.34854888916015625, "kl": 0.060104332864284515, "learning_rate": 4.98887192721938e-06, "loss": -0.0, "reward": 3.0, "reward_std": 1.8929693698883057, "rewards/confident_score_func": 0.25, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 640 }, { "completion_length": 532.5, "epoch": 0.1776607538802661, "grad_norm": 0.27584362030029297, "kl": 0.021874817088246346, "learning_rate": 4.9888306281680405e-06, "loss": -0.0, "reward": 3.875, "reward_std": 2.1746647357940674, "rewards/confident_score_func": 1.125, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 641 }, { "completion_length": 496.75, "epoch": 0.1779379157427938, "grad_norm": 0.326592355966568, "kl": 0.01839117519557476, "learning_rate": 4.988789252794619e-06, "loss": 0.0, "reward": 4.0, "reward_std": 2.020725965499878, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 642 }, { "completion_length": 455.25, "epoch": 0.17821507760532151, "grad_norm": 0.28637126088142395, "kl": 0.019522879272699356, "learning_rate": 4.988747801100385e-06, "loss": 0.0, "reward": 2.875, "reward_std": 1.314977765083313, "rewards/confident_score_func": 0.125, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 643 }, { "completion_length": 474.0, "epoch": 0.17849223946784923, "grad_norm": 0.0, "kl": 0.01786750555038452, "learning_rate": 4.988706273086608e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 644 }, { "completion_length": 487.0, "epoch": 0.17876940133037694, "grad_norm": 0.2858082056045532, "kl": 0.015546293929219246, "learning_rate": 4.988664668754563e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 645 }, { "completion_length": 459.75, "epoch": 0.17904656319290466, "grad_norm": 0.30719637870788574, "kl": 0.018810003995895386, "learning_rate": 4.988622988105525e-06, "loss": 0.0, "reward": 3.625, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.875, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 646 }, { "completion_length": 509.5, "epoch": 0.17932372505543237, "grad_norm": 0.30032268166542053, "kl": 0.016920914873480797, "learning_rate": 4.988581231140772e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 647 }, { "completion_length": 470.25, "epoch": 0.17960088691796008, "grad_norm": 0.28382131457328796, "kl": 0.01654200069606304, "learning_rate": 4.988539397861586e-06, "loss": 0.0, "reward": 4.5, "reward_std": 1.8929693698883057, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 648 }, { "completion_length": 420.75, "epoch": 0.1798780487804878, "grad_norm": 0.3374135196208954, "kl": 0.02220698818564415, "learning_rate": 4.988497488269248e-06, "loss": 0.0, "reward": 4.5, "reward_std": 1.8929693698883057, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 649 }, { "completion_length": 566.25, "epoch": 0.1801552106430155, "grad_norm": 0.24336610734462738, "kl": 0.016483157873153687, "learning_rate": 4.988455502365044e-06, "loss": -0.0, "reward": 2.625, "reward_std": 1.4361406564712524, "rewards/confident_score_func": 0.375, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 650 }, { "completion_length": 471.25, "epoch": 0.18043237250554323, "grad_norm": 0.28418511152267456, "kl": 0.020266445353627205, "learning_rate": 4.988413440150261e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 651 }, { "completion_length": 440.0, "epoch": 0.18070953436807094, "grad_norm": 0.3385154604911804, "kl": 0.018640443682670593, "learning_rate": 4.98837130162619e-06, "loss": -0.0, "reward": 5.5, "reward_std": 0.5, "rewards/confident_score_func": 1.75, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 652 }, { "completion_length": 477.75, "epoch": 0.18098669623059868, "grad_norm": 0.3234453797340393, "kl": 0.018160749226808548, "learning_rate": 4.988329086794122e-06, "loss": -0.0, "reward": 3.5, "reward_std": 2.0615527629852295, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 653 }, { "completion_length": 496.75, "epoch": 0.1812638580931264, "grad_norm": 0.5715420842170715, "kl": 0.02003759704530239, "learning_rate": 4.988286795655353e-06, "loss": 0.0, "reward": 4.5, "reward_std": 1.8929693698883057, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 654 }, { "completion_length": 516.0, "epoch": 0.1815410199556541, "grad_norm": 0.24533885717391968, "kl": 0.013808531686663628, "learning_rate": 4.988244428211178e-06, "loss": -0.0, "reward": 4.875, "reward_std": 1.75, "rewards/confident_score_func": 1.625, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 655 }, { "completion_length": 570.75, "epoch": 0.18181818181818182, "grad_norm": 0.267109215259552, "kl": 0.015175247564911842, "learning_rate": 4.988201984462897e-06, "loss": 0.0, "reward": 3.5, "reward_std": 2.0615527629852295, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 656 }, { "completion_length": 558.0, "epoch": 0.18209534368070954, "grad_norm": 0.3129168450832367, "kl": 0.014518789015710354, "learning_rate": 4.9881594644118125e-06, "loss": -0.0, "reward": 4.625, "reward_std": 1.6520190238952637, "rewards/confident_score_func": 1.375, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 657 }, { "completion_length": 490.0, "epoch": 0.18237250554323725, "grad_norm": 0.29545313119888306, "kl": 0.01587422378361225, "learning_rate": 4.9881168680592274e-06, "loss": 0.0, "reward": 4.875, "reward_std": 1.75, "rewards/confident_score_func": 1.625, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 658 }, { "completion_length": 587.25, "epoch": 0.18264966740576496, "grad_norm": 0.2517118453979492, "kl": 0.012477448210120201, "learning_rate": 4.988074195406449e-06, "loss": 0.0, "reward": 3.5, "reward_std": 2.0615527629852295, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 659 }, { "completion_length": 464.75, "epoch": 0.18292682926829268, "grad_norm": 0.40868687629699707, "kl": 0.017503373324871063, "learning_rate": 4.988031446454784e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 660 }, { "completion_length": 565.75, "epoch": 0.1832039911308204, "grad_norm": 0.27332305908203125, "kl": 0.013960846699774265, "learning_rate": 4.987988621205546e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.041241407394409, "rewards/confident_score_func": 1.125, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.75, "step": 661 }, { "completion_length": 500.0, "epoch": 0.1834811529933481, "grad_norm": 0.3823421895503998, "kl": 0.014911696314811707, "learning_rate": 4.987945719660046e-06, "loss": -0.0, "reward": 5.5, "reward_std": 0.5, "rewards/confident_score_func": 1.75, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 662 }, { "completion_length": 630.25, "epoch": 0.18375831485587582, "grad_norm": 0.2895660698413849, "kl": 0.015531031414866447, "learning_rate": 4.9879027418196e-06, "loss": -0.0, "reward": 4.625, "reward_std": 1.6520190238952637, "rewards/confident_score_func": 1.375, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 663 }, { "completion_length": 506.5, "epoch": 0.18403547671840353, "grad_norm": 0.29177072644233704, "kl": 0.013314553536474705, "learning_rate": 4.987859687685526e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 664 }, { "completion_length": 602.0, "epoch": 0.18431263858093128, "grad_norm": 0.32873037457466125, "kl": 0.013525662012398243, "learning_rate": 4.987816557259145e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 665 }, { "completion_length": 545.0, "epoch": 0.184589800443459, "grad_norm": 0.26758530735969543, "kl": 0.012226082384586334, "learning_rate": 4.98777335054178e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 666 }, { "completion_length": 538.5, "epoch": 0.1848669623059867, "grad_norm": 0.26461780071258545, "kl": 0.016506390646100044, "learning_rate": 4.987730067534754e-06, "loss": 0.0, "reward": 4.625, "reward_std": 1.6520190238952637, "rewards/confident_score_func": 1.375, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 667 }, { "completion_length": 525.75, "epoch": 0.18514412416851442, "grad_norm": 0.2866877317428589, "kl": 0.012225452810525894, "learning_rate": 4.987686708239396e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 668 }, { "completion_length": 498.25, "epoch": 0.18542128603104213, "grad_norm": 0.30999526381492615, "kl": 0.012925153598189354, "learning_rate": 4.9876432726570364e-06, "loss": -0.0, "reward": 3.875, "reward_std": 2.1746647357940674, "rewards/confident_score_func": 1.125, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 669 }, { "completion_length": 596.25, "epoch": 0.18569844789356985, "grad_norm": 0.26960721611976624, "kl": 0.014636464416980743, "learning_rate": 4.987599760789005e-06, "loss": -0.0, "reward": 4.625, "reward_std": 1.6520190238952637, "rewards/confident_score_func": 1.375, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 670 }, { "completion_length": 563.0, "epoch": 0.18597560975609756, "grad_norm": 0.2842220366001129, "kl": 0.016180483624339104, "learning_rate": 4.987556172636637e-06, "loss": 0.0, "reward": 3.0, "reward_std": 1.8484227657318115, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 671 }, { "completion_length": 440.0, "epoch": 0.18625277161862527, "grad_norm": 0.3479776978492737, "kl": 0.014780635945498943, "learning_rate": 4.987512508201269e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 672 }, { "completion_length": 519.5, "epoch": 0.186529933481153, "grad_norm": 0.24534489214420319, "kl": 0.014544305391609669, "learning_rate": 4.98746876748424e-06, "loss": 0.0, "reward": 4.875, "reward_std": 1.75, "rewards/confident_score_func": 1.625, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 673 }, { "completion_length": 588.0, "epoch": 0.1868070953436807, "grad_norm": 0.2538903057575226, "kl": 0.018481513485312462, "learning_rate": 4.987424950486892e-06, "loss": -0.0, "reward": 4.875, "reward_std": 1.75, "rewards/confident_score_func": 1.625, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 674 }, { "completion_length": 503.0, "epoch": 0.18708425720620842, "grad_norm": 0.24676164984703064, "kl": 0.017846444621682167, "learning_rate": 4.987381057210568e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 675 }, { "completion_length": 528.75, "epoch": 0.18736141906873613, "grad_norm": 0.2558610737323761, "kl": 0.015851646661758423, "learning_rate": 4.987337087656614e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 676 }, { "completion_length": 456.75, "epoch": 0.18763858093126387, "grad_norm": 0.3188142776489258, "kl": 0.018758637830615044, "learning_rate": 4.987293041826379e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 677 }, { "completion_length": 527.75, "epoch": 0.18791574279379158, "grad_norm": 0.31371644139289856, "kl": 0.0201277993619442, "learning_rate": 4.987248919721213e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 678 }, { "completion_length": 467.25, "epoch": 0.1881929046563193, "grad_norm": 0.3149774670600891, "kl": 0.02090691775083542, "learning_rate": 4.98720472134247e-06, "loss": 0.0, "reward": 4.5, "reward_std": 1.8929693698883057, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 679 }, { "completion_length": 481.25, "epoch": 0.188470066518847, "grad_norm": 0.34842681884765625, "kl": 0.017783470451831818, "learning_rate": 4.987160446691504e-06, "loss": 0.0, "reward": 4.59375, "reward_std": 2.3125, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 680 }, { "completion_length": 548.25, "epoch": 0.18874722838137473, "grad_norm": 0.0, "kl": 0.01646880991756916, "learning_rate": 4.987116095769674e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 681 }, { "completion_length": 633.0, "epoch": 0.18902439024390244, "grad_norm": 0.26041746139526367, "kl": 0.015002479776740074, "learning_rate": 4.987071668578339e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 682 }, { "completion_length": 572.0, "epoch": 0.18930155210643015, "grad_norm": 0.3190310597419739, "kl": 0.014419038780033588, "learning_rate": 4.987027165118862e-06, "loss": 0.0, "reward": 3.84375, "reward_std": 2.5235867500305176, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 683 }, { "completion_length": 533.75, "epoch": 0.18957871396895787, "grad_norm": 0.3302563428878784, "kl": 0.019702976569533348, "learning_rate": 4.986982585392608e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 684 }, { "completion_length": 491.75, "epoch": 0.18985587583148558, "grad_norm": 0.0, "kl": 0.014371244236826897, "learning_rate": 4.9869379294009425e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 685 }, { "completion_length": 494.5, "epoch": 0.1901330376940133, "grad_norm": 0.342250794172287, "kl": 0.016446148976683617, "learning_rate": 4.986893197145238e-06, "loss": 0.0, "reward": 3.5, "reward_std": 2.0615527629852295, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 686 }, { "completion_length": 524.5, "epoch": 0.190410199556541, "grad_norm": 0.283232718706131, "kl": 0.016477465629577637, "learning_rate": 4.986848388626863e-06, "loss": -0.0, "reward": 1.875, "reward_std": 0.25, "rewards/confident_score_func": 0.125, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 687 }, { "completion_length": 574.5, "epoch": 0.19068736141906872, "grad_norm": 0.29094016551971436, "kl": 0.017400063574314117, "learning_rate": 4.986803503847193e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 688 }, { "completion_length": 482.25, "epoch": 0.19096452328159647, "grad_norm": 0.3430641293525696, "kl": 0.023926349356770515, "learning_rate": 4.986758542807605e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 689 }, { "completion_length": 489.25, "epoch": 0.19124168514412418, "grad_norm": 0.3184714913368225, "kl": 0.015062497928738594, "learning_rate": 4.986713505509476e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 690 }, { "completion_length": 483.5, "epoch": 0.1915188470066519, "grad_norm": 0.4596671462059021, "kl": 0.014818599447607994, "learning_rate": 4.986668391954189e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 691 }, { "completion_length": 556.5, "epoch": 0.1917960088691796, "grad_norm": 0.24628838896751404, "kl": 0.015135711058974266, "learning_rate": 4.986623202143127e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 692 }, { "completion_length": 495.0, "epoch": 0.19207317073170732, "grad_norm": 0.28440213203430176, "kl": 0.015263823792338371, "learning_rate": 4.986577936077676e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 693 }, { "completion_length": 572.5, "epoch": 0.19235033259423504, "grad_norm": 0.2325010746717453, "kl": 0.013552853837609291, "learning_rate": 4.986532593759222e-06, "loss": -0.0, "reward": 1.875, "reward_std": 0.25, "rewards/confident_score_func": 0.125, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 694 }, { "completion_length": 606.25, "epoch": 0.19262749445676275, "grad_norm": 0.2510804235935211, "kl": 0.016212256625294685, "learning_rate": 4.986487175189159e-06, "loss": 0.0, "reward": 4.5, "reward_std": 1.8929693698883057, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 695 }, { "completion_length": 481.0, "epoch": 0.19290465631929046, "grad_norm": 0.3215952217578888, "kl": 0.017028754577040672, "learning_rate": 4.986441680368877e-06, "loss": -0.0, "reward": 2.875, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 696 }, { "completion_length": 528.0, "epoch": 0.19318181818181818, "grad_norm": 0.0, "kl": 0.017593160271644592, "learning_rate": 4.986396109299771e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 697 }, { "completion_length": 588.25, "epoch": 0.1934589800443459, "grad_norm": 0.292722225189209, "kl": 0.014229295775294304, "learning_rate": 4.98635046198324e-06, "loss": -0.0, "reward": 2.8125, "reward_std": 1.9618761539459229, "rewards/confident_score_func": 0.25, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.6875, "step": 698 }, { "completion_length": 418.25, "epoch": 0.1937361419068736, "grad_norm": 0.0, "kl": 0.019511643797159195, "learning_rate": 4.986304738420684e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 699 }, { "completion_length": 608.25, "epoch": 0.19401330376940132, "grad_norm": 0.2607821822166443, "kl": 0.014289142563939095, "learning_rate": 4.986258938613504e-06, "loss": -0.0, "reward": 4.875, "reward_std": 1.75, "rewards/confident_score_func": 1.625, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 700 }, { "completion_length": 467.5, "epoch": 0.19429046563192906, "grad_norm": 0.297850638628006, "kl": 0.020164215937256813, "learning_rate": 4.986213062563104e-06, "loss": -0.0, "reward": 2.875, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 701 }, { "completion_length": 540.25, "epoch": 0.19456762749445677, "grad_norm": 0.25706377625465393, "kl": 0.019216051325201988, "learning_rate": 4.986167110270893e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 702 }, { "completion_length": 484.0, "epoch": 0.1948447893569845, "grad_norm": 0.2961105704307556, "kl": 0.02108294703066349, "learning_rate": 4.986121081738279e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 703 }, { "completion_length": 546.0, "epoch": 0.1951219512195122, "grad_norm": 0.3241623640060425, "kl": 0.014842013828456402, "learning_rate": 4.986074976966672e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 704 }, { "completion_length": 496.25, "epoch": 0.19539911308203992, "grad_norm": 0.2834533452987671, "kl": 0.02097104676067829, "learning_rate": 4.986028795957489e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 705 }, { "completion_length": 485.0, "epoch": 0.19567627494456763, "grad_norm": 0.36659395694732666, "kl": 0.03226753696799278, "learning_rate": 4.985982538712144e-06, "loss": 0.0, "reward": 3.625, "reward_std": 2.1746647357940674, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.625, "step": 706 }, { "completion_length": 509.0, "epoch": 0.19595343680709534, "grad_norm": 0.0, "kl": 0.013087481260299683, "learning_rate": 4.985936205232056e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 707 }, { "completion_length": 551.0, "epoch": 0.19623059866962306, "grad_norm": 0.271384060382843, "kl": 0.03143215551972389, "learning_rate": 4.985889795518646e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 708 }, { "completion_length": 493.25, "epoch": 0.19650776053215077, "grad_norm": 0.2817094624042511, "kl": 0.020037773996591568, "learning_rate": 4.985843309573336e-06, "loss": -0.0, "reward": 3.5, "reward_std": 2.0615527629852295, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 709 }, { "completion_length": 502.0, "epoch": 0.19678492239467849, "grad_norm": 0.29904651641845703, "kl": 0.0158680472522974, "learning_rate": 4.985796747397553e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 710 }, { "completion_length": 523.0, "epoch": 0.1970620842572062, "grad_norm": 0.2594645917415619, "kl": 0.019786491990089417, "learning_rate": 4.985750108992725e-06, "loss": 0.0, "reward": 4.5, "reward_std": 1.8929693698883057, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 711 }, { "completion_length": 502.75, "epoch": 0.1973392461197339, "grad_norm": 0.31635233759880066, "kl": 0.016957612708210945, "learning_rate": 4.985703394360281e-06, "loss": 0.0, "reward": 5.5, "reward_std": 0.5, "rewards/confident_score_func": 1.75, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 712 }, { "completion_length": 504.75, "epoch": 0.19761640798226163, "grad_norm": 0.2798074185848236, "kl": 0.022341173142194748, "learning_rate": 4.985656603501654e-06, "loss": -0.0, "reward": 3.875, "reward_std": 2.1746647357940674, "rewards/confident_score_func": 1.125, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 713 }, { "completion_length": 503.0, "epoch": 0.19789356984478937, "grad_norm": 0.0, "kl": 0.021068597212433815, "learning_rate": 4.98560973641828e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 714 }, { "completion_length": 566.75, "epoch": 0.19817073170731708, "grad_norm": 0.29654109477996826, "kl": 0.01875092275440693, "learning_rate": 4.985562793111594e-06, "loss": 0.0, "reward": 4.875, "reward_std": 1.75, "rewards/confident_score_func": 1.625, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 715 }, { "completion_length": 524.75, "epoch": 0.1984478935698448, "grad_norm": 0.0, "kl": 0.01779516413807869, "learning_rate": 4.985515773583038e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 716 }, { "completion_length": 474.75, "epoch": 0.1987250554323725, "grad_norm": 0.31776195764541626, "kl": 0.019880501553416252, "learning_rate": 4.985468677834052e-06, "loss": -0.0, "reward": 4.875, "reward_std": 1.75, "rewards/confident_score_func": 1.625, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 717 }, { "completion_length": 517.75, "epoch": 0.19900221729490022, "grad_norm": 0.31698447465896606, "kl": 0.018504437059164047, "learning_rate": 4.985421505866081e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 718 }, { "completion_length": 528.75, "epoch": 0.19927937915742794, "grad_norm": 0.3119114637374878, "kl": 0.017028018832206726, "learning_rate": 4.985374257680572e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 719 }, { "completion_length": 561.0, "epoch": 0.19955654101995565, "grad_norm": 0.2883661091327667, "kl": 0.01798996329307556, "learning_rate": 4.9853269332789725e-06, "loss": -0.0, "reward": 2.875, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 720 }, { "completion_length": 611.5, "epoch": 0.19983370288248337, "grad_norm": 0.24705825746059418, "kl": 0.015276697464287281, "learning_rate": 4.985279532662736e-06, "loss": 0.0, "reward": 3.4375, "reward_std": 1.9724667072296143, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.6875, "step": 721 }, { "completion_length": 526.75, "epoch": 0.20011086474501108, "grad_norm": 0.0, "kl": 0.020037135109305382, "learning_rate": 4.985232055833314e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 722 }, { "completion_length": 540.25, "epoch": 0.2003880266075388, "grad_norm": 0.27179664373397827, "kl": 0.015667878091335297, "learning_rate": 4.985184502792162e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 723 }, { "completion_length": 552.5, "epoch": 0.2006651884700665, "grad_norm": 0.2781732678413391, "kl": 0.01817716658115387, "learning_rate": 4.985136873540741e-06, "loss": -0.0, "reward": 2.875, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 724 }, { "completion_length": 584.0, "epoch": 0.20094235033259422, "grad_norm": 0.25268349051475525, "kl": 0.018594888970255852, "learning_rate": 4.985089168080509e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 725 }, { "completion_length": 419.25, "epoch": 0.20121951219512196, "grad_norm": 0.0, "kl": 0.021978231146931648, "learning_rate": 4.985041386412931e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 726 }, { "completion_length": 563.25, "epoch": 0.20149667405764968, "grad_norm": 0.2749530076980591, "kl": 0.0175306499004364, "learning_rate": 4.984993528539471e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 727 }, { "completion_length": 529.5, "epoch": 0.2017738359201774, "grad_norm": 0.3593299686908722, "kl": 0.022030401974916458, "learning_rate": 4.984945594461596e-06, "loss": 0.0, "reward": 5.5, "reward_std": 0.5, "rewards/confident_score_func": 1.75, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 728 }, { "completion_length": 515.25, "epoch": 0.2020509977827051, "grad_norm": 0.29327070713043213, "kl": 0.022453149780631065, "learning_rate": 4.984897584180777e-06, "loss": 0.0, "reward": 3.5, "reward_std": 2.0615527629852295, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 729 }, { "completion_length": 528.0, "epoch": 0.20232815964523282, "grad_norm": 0.3085457384586334, "kl": 0.03400176018476486, "learning_rate": 4.984849497698487e-06, "loss": 0.0, "reward": 3.5, "reward_std": 2.0615527629852295, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 730 }, { "completion_length": 480.25, "epoch": 0.20260532150776053, "grad_norm": 0.32194915413856506, "kl": 0.026344871148467064, "learning_rate": 4.984801335016198e-06, "loss": 0.0, "reward": 3.625, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.875, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 731 }, { "completion_length": 495.5, "epoch": 0.20288248337028825, "grad_norm": 0.2849930226802826, "kl": 0.0221638735383749, "learning_rate": 4.98475309613539e-06, "loss": -0.0, "reward": 3.875, "reward_std": 2.1746647357940674, "rewards/confident_score_func": 1.125, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 732 }, { "completion_length": 548.5, "epoch": 0.20315964523281596, "grad_norm": 0.27746328711509705, "kl": 0.020619232207536697, "learning_rate": 4.98470478105754e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 733 }, { "completion_length": 480.0, "epoch": 0.20343680709534367, "grad_norm": 0.31370967626571655, "kl": 0.02127794362604618, "learning_rate": 4.9846563897841306e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 734 }, { "completion_length": 566.0, "epoch": 0.2037139689578714, "grad_norm": 0.273686021566391, "kl": 0.020528331398963928, "learning_rate": 4.984607922316646e-06, "loss": -0.0, "reward": 2.5, "reward_std": 1.5, "rewards/confident_score_func": 0.25, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 735 }, { "completion_length": 559.5, "epoch": 0.2039911308203991, "grad_norm": 0.266985684633255, "kl": 0.02090374007821083, "learning_rate": 4.984559378656572e-06, "loss": -0.0, "reward": 3.875, "reward_std": 2.1746647357940674, "rewards/confident_score_func": 1.125, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 736 }, { "completion_length": 572.75, "epoch": 0.20426829268292682, "grad_norm": 0.0, "kl": 0.01874251663684845, "learning_rate": 4.984510758805397e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 737 }, { "completion_length": 497.75, "epoch": 0.20454545454545456, "grad_norm": 0.3090013563632965, "kl": 0.021400412544608116, "learning_rate": 4.9844620627646125e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 738 }, { "completion_length": 548.0, "epoch": 0.20482261640798227, "grad_norm": 0.3706728219985962, "kl": 0.01748868264257908, "learning_rate": 4.984413290535711e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 739 }, { "completion_length": 507.0, "epoch": 0.20509977827050999, "grad_norm": 0.324607789516449, "kl": 0.038607917726039886, "learning_rate": 4.984364442120189e-06, "loss": 0.0, "reward": 5.5625, "reward_std": 0.375, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.6875, "step": 740 }, { "completion_length": 607.0, "epoch": 0.2053769401330377, "grad_norm": 0.3417738974094391, "kl": 0.01764594577252865, "learning_rate": 4.984315517519546e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 741 }, { "completion_length": 501.75, "epoch": 0.2056541019955654, "grad_norm": 0.0, "kl": 0.019935254007577896, "learning_rate": 4.984266516735279e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 742 }, { "completion_length": 458.25, "epoch": 0.20593126385809313, "grad_norm": 0.34117597341537476, "kl": 0.04974701255559921, "learning_rate": 4.984217439768892e-06, "loss": -0.0, "reward": 2.8125, "reward_std": 1.983000636100769, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.6875, "step": 743 }, { "completion_length": 520.0, "epoch": 0.20620842572062084, "grad_norm": 0.0, "kl": 0.020387694239616394, "learning_rate": 4.984168286621892e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 744 }, { "completion_length": 557.75, "epoch": 0.20648558758314856, "grad_norm": 0.24825549125671387, "kl": 0.01496980246156454, "learning_rate": 4.984119057295783e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 745 }, { "completion_length": 558.0, "epoch": 0.20676274944567627, "grad_norm": 0.2798282206058502, "kl": 0.0184344369918108, "learning_rate": 4.984069751792077e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 746 }, { "completion_length": 439.25, "epoch": 0.20703991130820398, "grad_norm": 0.326139897108078, "kl": 0.02269004099071026, "learning_rate": 4.9840203701122844e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 747 }, { "completion_length": 522.75, "epoch": 0.2073170731707317, "grad_norm": 0.2700923979282379, "kl": 0.026969650760293007, "learning_rate": 4.98397091225792e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 748 }, { "completion_length": 568.25, "epoch": 0.2075942350332594, "grad_norm": 0.2621520161628723, "kl": 0.021514682099223137, "learning_rate": 4.9839213782305015e-06, "loss": 0.0, "reward": 3.875, "reward_std": 2.1746647357940674, "rewards/confident_score_func": 1.125, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 749 }, { "completion_length": 464.75, "epoch": 0.20787139689578715, "grad_norm": 0.3132356107234955, "kl": 0.019771268591284752, "learning_rate": 4.983871768031548e-06, "loss": -0.0, "reward": 3.5, "reward_std": 2.0615527629852295, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 750 }, { "completion_length": 541.75, "epoch": 0.20814855875831487, "grad_norm": 0.27938398718833923, "kl": 0.019035225734114647, "learning_rate": 4.983822081662578e-06, "loss": 0.0, "reward": 4.5, "reward_std": 1.8929693698883057, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 751 }, { "completion_length": 624.0, "epoch": 0.20842572062084258, "grad_norm": 0.210989311337471, "kl": 0.016687212511897087, "learning_rate": 4.983772319125118e-06, "loss": -0.0, "reward": 2.875, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 752 }, { "completion_length": 567.5, "epoch": 0.2087028824833703, "grad_norm": 0.0, "kl": 0.022662362083792686, "learning_rate": 4.983722480420694e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 753 }, { "completion_length": 572.5, "epoch": 0.208980044345898, "grad_norm": 0.0, "kl": 0.01484181359410286, "learning_rate": 4.983672565550834e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 754 }, { "completion_length": 447.25, "epoch": 0.20925720620842572, "grad_norm": 0.4025910496711731, "kl": 0.10996074974536896, "learning_rate": 4.983622574517066e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 755 }, { "completion_length": 555.75, "epoch": 0.20953436807095344, "grad_norm": 0.2488221824169159, "kl": 0.015928253531455994, "learning_rate": 4.983572507320928e-06, "loss": 0.0, "reward": 4.8125, "reward_std": 1.875, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.6875, "step": 756 }, { "completion_length": 425.75, "epoch": 0.20981152993348115, "grad_norm": 0.3245978355407715, "kl": 0.021901341155171394, "learning_rate": 4.98352236396395e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 757 }, { "completion_length": 502.75, "epoch": 0.21008869179600886, "grad_norm": 0.2737119495868683, "kl": 0.018232164904475212, "learning_rate": 4.983472144447673e-06, "loss": 0.0, "reward": 5.5, "reward_std": 0.5, "rewards/confident_score_func": 1.75, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 758 }, { "completion_length": 462.0, "epoch": 0.21036585365853658, "grad_norm": 0.0, "kl": 0.019390055909752846, "learning_rate": 4.983421848773638e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 759 }, { "completion_length": 519.75, "epoch": 0.2106430155210643, "grad_norm": 0.3725380301475525, "kl": 0.017390461638569832, "learning_rate": 4.983371476943384e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 760 }, { "completion_length": 491.25, "epoch": 0.210920177383592, "grad_norm": 0.34917277097702026, "kl": 0.020555460825562477, "learning_rate": 4.9833210289584574e-06, "loss": -0.0, "reward": 3.5, "reward_std": 2.0615527629852295, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 761 }, { "completion_length": 434.25, "epoch": 0.21119733924611975, "grad_norm": 0.0, "kl": 0.021596763283014297, "learning_rate": 4.9832705048204056e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 762 }, { "completion_length": 488.5, "epoch": 0.21147450110864746, "grad_norm": 0.28092244267463684, "kl": 0.01867128722369671, "learning_rate": 4.983219904530778e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 763 }, { "completion_length": 539.25, "epoch": 0.21175166297117518, "grad_norm": 0.30547571182250977, "kl": 0.02181440219283104, "learning_rate": 4.983169228091125e-06, "loss": -0.0, "reward": 3.5, "reward_std": 2.0615527629852295, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 764 }, { "completion_length": 564.0, "epoch": 0.2120288248337029, "grad_norm": 0.0, "kl": 0.01906001940369606, "learning_rate": 4.983118475503003e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 765 }, { "completion_length": 444.25, "epoch": 0.2123059866962306, "grad_norm": 0.0, "kl": 0.018885089084506035, "learning_rate": 4.983067646767965e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 766 }, { "completion_length": 529.5, "epoch": 0.21258314855875832, "grad_norm": 0.33771273493766785, "kl": 0.0210456233471632, "learning_rate": 4.983016741887573e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 767 }, { "completion_length": 527.5, "epoch": 0.21286031042128603, "grad_norm": 0.29220226407051086, "kl": 0.022220009937882423, "learning_rate": 4.982965760863386e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 768 }, { "completion_length": 533.5, "epoch": 0.21313747228381374, "grad_norm": 0.27981311082839966, "kl": 0.019755125045776367, "learning_rate": 4.982914703696967e-06, "loss": 0.0, "reward": 3.5, "reward_std": 2.0615527629852295, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 769 }, { "completion_length": 536.25, "epoch": 0.21341463414634146, "grad_norm": 0.0, "kl": 0.019189266487956047, "learning_rate": 4.982863570389884e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 770 }, { "completion_length": 438.75, "epoch": 0.21369179600886917, "grad_norm": 0.28348249197006226, "kl": 0.02700541354715824, "learning_rate": 4.982812360943704e-06, "loss": -0.0, "reward": 2.5, "reward_std": 1.5, "rewards/confident_score_func": 0.25, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 771 }, { "completion_length": 461.25, "epoch": 0.2139689578713969, "grad_norm": 0.0, "kl": 0.023209549486637115, "learning_rate": 4.982761075359996e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 772 }, { "completion_length": 478.25, "epoch": 0.2142461197339246, "grad_norm": 0.2965247333049774, "kl": 0.019414717331528664, "learning_rate": 4.982709713640335e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 773 }, { "completion_length": 589.25, "epoch": 0.21452328159645231, "grad_norm": 0.27074089646339417, "kl": 0.017208445817232132, "learning_rate": 4.982658275786294e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 774 }, { "completion_length": 560.25, "epoch": 0.21480044345898006, "grad_norm": 0.26861152052879333, "kl": 0.018620409071445465, "learning_rate": 4.9826067617994515e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 775 }, { "completion_length": 479.25, "epoch": 0.21507760532150777, "grad_norm": 0.0, "kl": 0.019963320344686508, "learning_rate": 4.982555171681388e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 776 }, { "completion_length": 483.75, "epoch": 0.21535476718403548, "grad_norm": 0.0, "kl": 0.024769550189375877, "learning_rate": 4.982503505433683e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 777 }, { "completion_length": 568.75, "epoch": 0.2156319290465632, "grad_norm": 0.0, "kl": 0.016771981492638588, "learning_rate": 4.982451763057923e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 778 }, { "completion_length": 528.0, "epoch": 0.2159090909090909, "grad_norm": 0.3473654091358185, "kl": 0.01955060474574566, "learning_rate": 4.982399944555695e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 779 }, { "completion_length": 520.75, "epoch": 0.21618625277161863, "grad_norm": 0.0, "kl": 0.019809532910585403, "learning_rate": 4.982348049928586e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 780 }, { "completion_length": 566.5, "epoch": 0.21646341463414634, "grad_norm": 0.2691117823123932, "kl": 0.016962002962827682, "learning_rate": 4.982296079178189e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 781 }, { "completion_length": 446.25, "epoch": 0.21674057649667405, "grad_norm": 0.34900882840156555, "kl": 0.035438623279333115, "learning_rate": 4.982244032306097e-06, "loss": -0.0, "reward": 5.34375, "reward_std": 0.49344661831855774, "rewards/confident_score_func": 1.75, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 782 }, { "completion_length": 479.5, "epoch": 0.21701773835920177, "grad_norm": 0.0, "kl": 0.016627632081508636, "learning_rate": 4.9821919093139076e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 783 }, { "completion_length": 560.25, "epoch": 0.21729490022172948, "grad_norm": 0.29163771867752075, "kl": 0.02037329412996769, "learning_rate": 4.982139710203218e-06, "loss": 0.0, "reward": 3.625, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.875, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 784 }, { "completion_length": 545.0, "epoch": 0.2175720620842572, "grad_norm": 0.2792951166629791, "kl": 0.017334170639514923, "learning_rate": 4.982087434975628e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 785 }, { "completion_length": 496.0, "epoch": 0.2178492239467849, "grad_norm": 0.27233922481536865, "kl": 0.015808584168553352, "learning_rate": 4.9820350836327415e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 786 }, { "completion_length": 537.25, "epoch": 0.21812638580931265, "grad_norm": 0.30757907032966614, "kl": 0.017278878018260002, "learning_rate": 4.981982656176164e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 787 }, { "completion_length": 490.75, "epoch": 0.21840354767184036, "grad_norm": 0.3458501994609833, "kl": 0.01946025714278221, "learning_rate": 4.981930152607504e-06, "loss": -0.0, "reward": 4.25, "reward_std": 1.7320507764816284, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 788 }, { "completion_length": 427.75, "epoch": 0.21868070953436808, "grad_norm": 0.38121774792671204, "kl": 0.021321192383766174, "learning_rate": 4.98187757292837e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 789 }, { "completion_length": 495.75, "epoch": 0.2189578713968958, "grad_norm": 0.0, "kl": 0.019769325852394104, "learning_rate": 4.981824917140376e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 790 }, { "completion_length": 675.0, "epoch": 0.2192350332594235, "grad_norm": 0.33103570342063904, "kl": 0.02024010382592678, "learning_rate": 4.981772185245135e-06, "loss": 0.0, "reward": 2.0, "reward_std": 0.28867512941360474, "rewards/confident_score_func": 0.25, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 791 }, { "completion_length": 557.0, "epoch": 0.21951219512195122, "grad_norm": 0.2722875475883484, "kl": 0.018411610275506973, "learning_rate": 4.981719377244266e-06, "loss": -0.0, "reward": 3.4375, "reward_std": 2.7185704708099365, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.6875, "step": 792 }, { "completion_length": 607.75, "epoch": 0.21978935698447893, "grad_norm": 0.2501823306083679, "kl": 0.013551697134971619, "learning_rate": 4.9816664931393865e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 793 }, { "completion_length": 449.0, "epoch": 0.22006651884700665, "grad_norm": 0.28382161259651184, "kl": 0.02531256526708603, "learning_rate": 4.981613532932119e-06, "loss": 0.0, "reward": 1.875, "reward_std": 0.25, "rewards/confident_score_func": 0.125, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 794 }, { "completion_length": 476.5, "epoch": 0.22034368070953436, "grad_norm": 0.2765817940235138, "kl": 0.022134456783533096, "learning_rate": 4.981560496624089e-06, "loss": 0.0, "reward": 4.5, "reward_std": 1.8929693698883057, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 795 }, { "completion_length": 436.0, "epoch": 0.22062084257206208, "grad_norm": 0.34126514196395874, "kl": 0.020724505186080933, "learning_rate": 4.98150738421692e-06, "loss": -0.0, "reward": 3.875, "reward_std": 2.1746647357940674, "rewards/confident_score_func": 1.125, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 796 }, { "completion_length": 457.75, "epoch": 0.2208980044345898, "grad_norm": 0.2728124260902405, "kl": 0.016548385843634605, "learning_rate": 4.981454195712244e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 797 }, { "completion_length": 569.5, "epoch": 0.2211751662971175, "grad_norm": 0.26085343956947327, "kl": 0.016441812738776207, "learning_rate": 4.981400931111689e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 798 }, { "completion_length": 497.0, "epoch": 0.22145232815964525, "grad_norm": 0.3268621265888214, "kl": 0.015847239643335342, "learning_rate": 4.98134759041689e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 799 }, { "completion_length": 526.5, "epoch": 0.22172949002217296, "grad_norm": 0.24548351764678955, "kl": 0.01700493134558201, "learning_rate": 4.981294173629484e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 800 }, { "completion_length": 570.75, "epoch": 0.22200665188470067, "grad_norm": 0.27694517374038696, "kl": 0.016903389245271683, "learning_rate": 4.981240680751106e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 801 }, { "completion_length": 534.5, "epoch": 0.2222838137472284, "grad_norm": 0.0, "kl": 0.016327057033777237, "learning_rate": 4.981187111783399e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 802 }, { "completion_length": 525.75, "epoch": 0.2225609756097561, "grad_norm": 0.3122382164001465, "kl": 0.01879364624619484, "learning_rate": 4.981133466728004e-06, "loss": 0.0, "reward": 2.875, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 803 }, { "completion_length": 404.5, "epoch": 0.22283813747228381, "grad_norm": 0.30139172077178955, "kl": 0.025796644389629364, "learning_rate": 4.981079745586568e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 804 }, { "completion_length": 583.5, "epoch": 0.22311529933481153, "grad_norm": 0.3055345416069031, "kl": 0.015706758946180344, "learning_rate": 4.981025948360736e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 805 }, { "completion_length": 543.25, "epoch": 0.22339246119733924, "grad_norm": 0.27308082580566406, "kl": 0.016151010990142822, "learning_rate": 4.980972075052159e-06, "loss": -0.0, "reward": 3.5, "reward_std": 2.0615527629852295, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 806 }, { "completion_length": 490.5, "epoch": 0.22366962305986696, "grad_norm": 0.2758483588695526, "kl": 0.019977793097496033, "learning_rate": 4.9809181256624895e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 807 }, { "completion_length": 493.75, "epoch": 0.22394678492239467, "grad_norm": 0.44748446345329285, "kl": 0.019281120970845222, "learning_rate": 4.980864100193382e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 808 }, { "completion_length": 533.5, "epoch": 0.22422394678492238, "grad_norm": 0.27740657329559326, "kl": 0.01536676287651062, "learning_rate": 4.980809998646492e-06, "loss": -0.0, "reward": 3.875, "reward_std": 2.1746647357940674, "rewards/confident_score_func": 1.125, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 809 }, { "completion_length": 608.0, "epoch": 0.2245011086474501, "grad_norm": 0.2660789489746094, "kl": 0.015444022603332996, "learning_rate": 4.9807558210234784e-06, "loss": -0.0, "reward": 3.5, "reward_std": 2.0615527629852295, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 810 }, { "completion_length": 483.25, "epoch": 0.22477827050997784, "grad_norm": 0.308624267578125, "kl": 0.01946621760725975, "learning_rate": 4.980701567326005e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 811 }, { "completion_length": 454.75, "epoch": 0.22505543237250555, "grad_norm": 0.3050016760826111, "kl": 0.01569025218486786, "learning_rate": 4.980647237555734e-06, "loss": 0.0, "reward": 3.6875, "reward_std": 2.2395591735839844, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.6875, "step": 812 }, { "completion_length": 668.25, "epoch": 0.22533259423503327, "grad_norm": 0.2522878646850586, "kl": 0.035403117537498474, "learning_rate": 4.98059283171433e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 813 }, { "completion_length": 461.25, "epoch": 0.22560975609756098, "grad_norm": 0.3264232873916626, "kl": 0.016971362754702568, "learning_rate": 4.980538349803463e-06, "loss": 0.0, "reward": 4.78125, "reward_std": 1.6967339515686035, "rewards/confident_score_func": 1.625, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.65625, "step": 814 }, { "completion_length": 532.75, "epoch": 0.2258869179600887, "grad_norm": 0.29251036047935486, "kl": 0.015956146642565727, "learning_rate": 4.980483791824805e-06, "loss": 0.0, "reward": 3.65625, "reward_std": 2.4224965572357178, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.65625, "step": 815 }, { "completion_length": 520.5, "epoch": 0.2261640798226164, "grad_norm": 0.30170685052871704, "kl": 0.018171321600675583, "learning_rate": 4.980429157780027e-06, "loss": -0.0, "reward": 4.46875, "reward_std": 2.1659653186798096, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 816 }, { "completion_length": 474.75, "epoch": 0.22644124168514412, "grad_norm": 0.0, "kl": 0.017544658854603767, "learning_rate": 4.980374447670805e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 817 }, { "completion_length": 576.5, "epoch": 0.22671840354767184, "grad_norm": 0.24968773126602173, "kl": 0.01814916916191578, "learning_rate": 4.980319661498817e-06, "loss": 0.0, "reward": 4.5, "reward_std": 1.8929693698883057, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 818 }, { "completion_length": 593.75, "epoch": 0.22699556541019955, "grad_norm": 0.0, "kl": 0.019223220646381378, "learning_rate": 4.9802647992657425e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 819 }, { "completion_length": 517.25, "epoch": 0.22727272727272727, "grad_norm": 0.33500805497169495, "kl": 0.017896898090839386, "learning_rate": 4.980209860973264e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 820 }, { "completion_length": 498.5, "epoch": 0.22754988913525498, "grad_norm": 0.0, "kl": 0.01674572005867958, "learning_rate": 4.980154846623067e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 821 }, { "completion_length": 559.25, "epoch": 0.2278270509977827, "grad_norm": 0.27772533893585205, "kl": 0.04434746876358986, "learning_rate": 4.980099756216838e-06, "loss": -0.0, "reward": 3.625, "reward_std": 2.462214469909668, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.75, "step": 822 }, { "completion_length": 470.75, "epoch": 0.22810421286031043, "grad_norm": 0.37916597723960876, "kl": 0.037905968725681305, "learning_rate": 4.980044589756266e-06, "loss": 0.0, "reward": 4.0625, "reward_std": 3.375, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.5625, "step": 823 }, { "completion_length": 550.5, "epoch": 0.22838137472283815, "grad_norm": 0.3082723319530487, "kl": 0.019657133147120476, "learning_rate": 4.979989347243044e-06, "loss": -0.0, "reward": 4.25, "reward_std": 1.7320507764816284, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 824 }, { "completion_length": 552.0, "epoch": 0.22865853658536586, "grad_norm": 0.2667745351791382, "kl": 0.040961142629384995, "learning_rate": 4.979934028678865e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 825 }, { "completion_length": 494.25, "epoch": 0.22893569844789358, "grad_norm": 0.3349105417728424, "kl": 0.02539118379354477, "learning_rate": 4.979878634065425e-06, "loss": -0.0, "reward": 2.625, "reward_std": 1.4361406564712524, "rewards/confident_score_func": 0.375, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 826 }, { "completion_length": 574.0, "epoch": 0.2292128603104213, "grad_norm": 0.29252195358276367, "kl": 0.0317128524184227, "learning_rate": 4.979823163404424e-06, "loss": 0.0, "reward": 3.96875, "reward_std": 2.0268380641937256, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.71875, "step": 827 }, { "completion_length": 527.75, "epoch": 0.229490022172949, "grad_norm": 0.29345282912254333, "kl": 0.022497624158859253, "learning_rate": 4.979767616697562e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 828 }, { "completion_length": 543.0, "epoch": 0.22976718403547672, "grad_norm": 0.29133546352386475, "kl": 0.036036793142557144, "learning_rate": 4.979711993946543e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 829 }, { "completion_length": 447.0, "epoch": 0.23004434589800443, "grad_norm": 0.0, "kl": 0.04546117037534714, "learning_rate": 4.979656295153073e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 830 }, { "completion_length": 549.5, "epoch": 0.23032150776053215, "grad_norm": 0.30063843727111816, "kl": 0.015371640212833881, "learning_rate": 4.979600520318858e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 831 }, { "completion_length": 468.5, "epoch": 0.23059866962305986, "grad_norm": 0.4528907835483551, "kl": 0.04475519433617592, "learning_rate": 4.979544669445611e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 832 }, { "completion_length": 578.75, "epoch": 0.23087583148558757, "grad_norm": 0.2575701177120209, "kl": 0.016100428998470306, "learning_rate": 4.979488742535043e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 833 }, { "completion_length": 545.5, "epoch": 0.2311529933481153, "grad_norm": 0.29647040367126465, "kl": 0.03143038600683212, "learning_rate": 4.979432739588871e-06, "loss": 0.0, "reward": 4.875, "reward_std": 1.75, "rewards/confident_score_func": 1.625, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 834 }, { "completion_length": 612.0, "epoch": 0.23143015521064303, "grad_norm": 0.26260313391685486, "kl": 0.015427027828991413, "learning_rate": 4.97937666060881e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 835 }, { "completion_length": 518.0, "epoch": 0.23170731707317074, "grad_norm": 0.29376310110092163, "kl": 0.017429932951927185, "learning_rate": 4.979320505596581e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 836 }, { "completion_length": 490.75, "epoch": 0.23198447893569846, "grad_norm": 0.3007899224758148, "kl": 0.02289397083222866, "learning_rate": 4.979264274553906e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 837 }, { "completion_length": 484.0, "epoch": 0.23226164079822617, "grad_norm": 0.327243834733963, "kl": 0.027746189385652542, "learning_rate": 4.979207967482508e-06, "loss": 0.0, "reward": 4.71875, "reward_std": 1.980043649673462, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.71875, "step": 838 }, { "completion_length": 480.0, "epoch": 0.23253880266075388, "grad_norm": 0.29068371653556824, "kl": 0.022175295278429985, "learning_rate": 4.979151584384116e-06, "loss": -0.0, "reward": 3.8125, "reward_std": 2.2395591735839844, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.6875, "step": 839 }, { "completion_length": 429.25, "epoch": 0.2328159645232816, "grad_norm": 0.0, "kl": 0.024049045518040657, "learning_rate": 4.979095125260458e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 840 }, { "completion_length": 500.5, "epoch": 0.2330931263858093, "grad_norm": 0.2942284643650055, "kl": 0.015080940909683704, "learning_rate": 4.979038590113266e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 841 }, { "completion_length": 410.25, "epoch": 0.23337028824833703, "grad_norm": 0.3270406424999237, "kl": 0.026759495958685875, "learning_rate": 4.978981978944271e-06, "loss": 0.0, "reward": 4.5, "reward_std": 1.8929693698883057, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 842 }, { "completion_length": 513.5, "epoch": 0.23364745011086474, "grad_norm": 0.0, "kl": 0.021457577124238014, "learning_rate": 4.978925291755212e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 843 }, { "completion_length": 455.25, "epoch": 0.23392461197339245, "grad_norm": 0.0, "kl": 0.02151050977408886, "learning_rate": 4.978868528547826e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 844 }, { "completion_length": 484.75, "epoch": 0.23420177383592017, "grad_norm": 0.0, "kl": 0.02882539853453636, "learning_rate": 4.978811689323855e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 845 }, { "completion_length": 496.5, "epoch": 0.23447893569844788, "grad_norm": 0.0, "kl": 0.02477957122027874, "learning_rate": 4.9787547740850406e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 846 }, { "completion_length": 476.75, "epoch": 0.2347560975609756, "grad_norm": 0.3175237476825714, "kl": 0.02385808527469635, "learning_rate": 4.978697782833128e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 847 }, { "completion_length": 463.0, "epoch": 0.23503325942350334, "grad_norm": 0.41546574234962463, "kl": 0.023860683664679527, "learning_rate": 4.9786407155698665e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 848 }, { "completion_length": 490.75, "epoch": 0.23531042128603105, "grad_norm": 0.0, "kl": 0.02401557005941868, "learning_rate": 4.978583572297005e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 849 }, { "completion_length": 549.25, "epoch": 0.23558758314855877, "grad_norm": 0.3486635386943817, "kl": 0.021586956456303596, "learning_rate": 4.978526353016296e-06, "loss": -0.0, "reward": 3.125, "reward_std": 1.75, "rewards/confident_score_func": 0.875, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 850 }, { "completion_length": 545.5, "epoch": 0.23586474501108648, "grad_norm": 0.28858447074890137, "kl": 0.02445022575557232, "learning_rate": 4.978469057729493e-06, "loss": -0.0, "reward": 3.0, "reward_std": 1.8484227657318115, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 851 }, { "completion_length": 552.0, "epoch": 0.2361419068736142, "grad_norm": 0.0, "kl": 0.0439331941306591, "learning_rate": 4.9784116864383555e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 852 }, { "completion_length": 507.75, "epoch": 0.2364190687361419, "grad_norm": 0.37179216742515564, "kl": 0.026774538680911064, "learning_rate": 4.978354239144641e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 853 }, { "completion_length": 452.25, "epoch": 0.23669623059866962, "grad_norm": 0.3175719082355499, "kl": 0.022535236552357674, "learning_rate": 4.978296715850113e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 854 }, { "completion_length": 419.75, "epoch": 0.23697339246119734, "grad_norm": 0.0, "kl": 0.027371246367692947, "learning_rate": 4.978239116556533e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 855 }, { "completion_length": 528.5, "epoch": 0.23725055432372505, "grad_norm": 0.34755009412765503, "kl": 0.023199986666440964, "learning_rate": 4.9781814412656684e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 856 }, { "completion_length": 433.0, "epoch": 0.23752771618625276, "grad_norm": 0.36239928007125854, "kl": 0.024544021114706993, "learning_rate": 4.9781236899792875e-06, "loss": -0.0, "reward": 2.875, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 857 }, { "completion_length": 495.25, "epoch": 0.23780487804878048, "grad_norm": 0.3218694031238556, "kl": 0.02656567469239235, "learning_rate": 4.978065862699162e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 858 }, { "completion_length": 498.5, "epoch": 0.2380820399113082, "grad_norm": 0.26501333713531494, "kl": 0.0234617181122303, "learning_rate": 4.978007959427066e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 859 }, { "completion_length": 510.25, "epoch": 0.23835920177383593, "grad_norm": 0.0, "kl": 0.037531957030296326, "learning_rate": 4.977949980164773e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 860 }, { "completion_length": 492.0, "epoch": 0.23863636363636365, "grad_norm": 0.35479581356048584, "kl": 0.02297566831111908, "learning_rate": 4.977891924914063e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 861 }, { "completion_length": 436.75, "epoch": 0.23891352549889136, "grad_norm": 0.393449991941452, "kl": 0.027653316035866737, "learning_rate": 4.977833793676715e-06, "loss": 0.0, "reward": 3.5, "reward_std": 2.0615527629852295, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 862 }, { "completion_length": 522.25, "epoch": 0.23919068736141907, "grad_norm": 0.35903316736221313, "kl": 0.025861958041787148, "learning_rate": 4.977775586454512e-06, "loss": 0.0, "reward": 2.875, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 863 }, { "completion_length": 473.5, "epoch": 0.2394678492239468, "grad_norm": 0.35696420073509216, "kl": 0.02760550193488598, "learning_rate": 4.977717303249239e-06, "loss": 0.0, "reward": 3.625, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.875, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 864 }, { "completion_length": 450.5, "epoch": 0.2397450110864745, "grad_norm": 0.34691768884658813, "kl": 0.02901630476117134, "learning_rate": 4.977658944062684e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 865 }, { "completion_length": 498.0, "epoch": 0.24002217294900222, "grad_norm": 0.28248533606529236, "kl": 0.023602023720741272, "learning_rate": 4.977600508896635e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 866 }, { "completion_length": 491.5, "epoch": 0.24029933481152993, "grad_norm": 0.3479516804218292, "kl": 0.02562032826244831, "learning_rate": 4.977541997752886e-06, "loss": 0.0, "reward": 2.65625, "reward_std": 1.8579082489013672, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.65625, "step": 867 }, { "completion_length": 480.25, "epoch": 0.24057649667405764, "grad_norm": 0.32932761311531067, "kl": 0.024307824671268463, "learning_rate": 4.977483410633229e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 868 }, { "completion_length": 511.75, "epoch": 0.24085365853658536, "grad_norm": 0.31682589650154114, "kl": 0.02515917271375656, "learning_rate": 4.9774247475394635e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 869 }, { "completion_length": 504.25, "epoch": 0.24113082039911307, "grad_norm": 0.30402863025665283, "kl": 0.020733339712023735, "learning_rate": 4.977366008473386e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 870 }, { "completion_length": 484.25, "epoch": 0.24140798226164079, "grad_norm": 0.3201925754547119, "kl": 0.028801102191209793, "learning_rate": 4.977307193436798e-06, "loss": -0.0, "reward": 4.375, "reward_std": 2.136000871658325, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 871 }, { "completion_length": 497.75, "epoch": 0.24168514412416853, "grad_norm": 0.28596213459968567, "kl": 0.02564484067261219, "learning_rate": 4.9772483024315045e-06, "loss": -0.0, "reward": 3.875, "reward_std": 2.1746647357940674, "rewards/confident_score_func": 1.125, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 872 }, { "completion_length": 476.25, "epoch": 0.24196230598669624, "grad_norm": 0.0, "kl": 0.021966423839330673, "learning_rate": 4.977189335459311e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 873 }, { "completion_length": 515.0, "epoch": 0.24223946784922396, "grad_norm": 0.29325148463249207, "kl": 0.02526821941137314, "learning_rate": 4.977130292522024e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 874 }, { "completion_length": 574.0, "epoch": 0.24251662971175167, "grad_norm": 0.29342690110206604, "kl": 0.020035311579704285, "learning_rate": 4.977071173621457e-06, "loss": 0.0, "reward": 4.5, "reward_std": 1.8929693698883057, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 875 }, { "completion_length": 496.75, "epoch": 0.24279379157427938, "grad_norm": 0.2935672700405121, "kl": 0.02222505770623684, "learning_rate": 4.977011978759421e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 876 }, { "completion_length": 529.25, "epoch": 0.2430709534368071, "grad_norm": 0.0, "kl": 0.023809215053915977, "learning_rate": 4.976952707937731e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 877 }, { "completion_length": 529.0, "epoch": 0.2433481152993348, "grad_norm": 0.325204998254776, "kl": 0.028701603412628174, "learning_rate": 4.976893361158206e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 878 }, { "completion_length": 488.0, "epoch": 0.24362527716186252, "grad_norm": 0.0, "kl": 0.023472066968679428, "learning_rate": 4.976833938422665e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 879 }, { "completion_length": 430.5, "epoch": 0.24390243902439024, "grad_norm": 0.31564459204673767, "kl": 0.02996698021888733, "learning_rate": 4.976774439732931e-06, "loss": -0.0, "reward": 1.625, "reward_std": 0.25, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 880 }, { "completion_length": 606.0, "epoch": 0.24417960088691795, "grad_norm": 0.0, "kl": 0.021742356941103935, "learning_rate": 4.976714865090827e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 881 }, { "completion_length": 605.5, "epoch": 0.24445676274944567, "grad_norm": 0.0, "kl": 0.025490516796708107, "learning_rate": 4.976655214498181e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 882 }, { "completion_length": 573.0, "epoch": 0.24473392461197338, "grad_norm": 0.2716931700706482, "kl": 0.020444929599761963, "learning_rate": 4.976595487956824e-06, "loss": -0.0, "reward": 3.5, "reward_std": 2.0615527629852295, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 883 }, { "completion_length": 511.25, "epoch": 0.24501108647450112, "grad_norm": 0.0, "kl": 0.05369368568062782, "learning_rate": 4.976535685468584e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 884 }, { "completion_length": 499.25, "epoch": 0.24528824833702884, "grad_norm": 0.3054690361022949, "kl": 0.027801262214779854, "learning_rate": 4.976475807035297e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 885 }, { "completion_length": 521.0, "epoch": 0.24556541019955655, "grad_norm": 0.335316926240921, "kl": 0.026878995820879936, "learning_rate": 4.9764158526588e-06, "loss": -0.0, "reward": 2.875, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 886 }, { "completion_length": 508.75, "epoch": 0.24584257206208426, "grad_norm": 0.46343788504600525, "kl": 0.04349976032972336, "learning_rate": 4.9763558223409294e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 887 }, { "completion_length": 487.5, "epoch": 0.24611973392461198, "grad_norm": 0.3655325472354889, "kl": 0.030285073444247246, "learning_rate": 4.976295716083527e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 888 }, { "completion_length": 509.0, "epoch": 0.2463968957871397, "grad_norm": 0.3210846781730652, "kl": 0.02372911013662815, "learning_rate": 4.976235533888437e-06, "loss": 0.0, "reward": 4.5, "reward_std": 1.8929693698883057, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 889 }, { "completion_length": 470.5, "epoch": 0.2466740576496674, "grad_norm": 0.3138309121131897, "kl": 0.042533062398433685, "learning_rate": 4.976175275757503e-06, "loss": 0.0, "reward": 1.875, "reward_std": 0.25, "rewards/confident_score_func": 0.125, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 890 }, { "completion_length": 505.0, "epoch": 0.24695121951219512, "grad_norm": 0.0, "kl": 0.029162060469388962, "learning_rate": 4.976114941692574e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 891 }, { "completion_length": 494.25, "epoch": 0.24722838137472283, "grad_norm": 0.37126967310905457, "kl": 0.04114137962460518, "learning_rate": 4.976054531695501e-06, "loss": -0.0, "reward": 5.25, "reward_std": 0.5773502588272095, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 892 }, { "completion_length": 552.0, "epoch": 0.24750554323725055, "grad_norm": 0.3012370467185974, "kl": 0.022133346647024155, "learning_rate": 4.975994045768136e-06, "loss": -0.0, "reward": 4.59375, "reward_std": 1.9185905456542969, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 893 }, { "completion_length": 505.5, "epoch": 0.24778270509977826, "grad_norm": 0.0, "kl": 0.02741900645196438, "learning_rate": 4.975933483912332e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 894 }, { "completion_length": 557.5, "epoch": 0.24805986696230597, "grad_norm": 0.2599855959415436, "kl": 0.0244450680911541, "learning_rate": 4.9758728461299485e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 895 }, { "completion_length": 578.5, "epoch": 0.24833702882483372, "grad_norm": 0.3464222848415375, "kl": 0.20065446197986603, "learning_rate": 4.975812132422843e-06, "loss": 0.0, "reward": 3.34375, "reward_std": 1.846660852432251, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 896 }, { "completion_length": 582.0, "epoch": 0.24861419068736143, "grad_norm": 0.0, "kl": 0.026625603437423706, "learning_rate": 4.97575134279288e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 897 }, { "completion_length": 469.75, "epoch": 0.24889135254988914, "grad_norm": 0.30046096444129944, "kl": 0.02855866216123104, "learning_rate": 4.9756904772419225e-06, "loss": 0.0, "reward": 5.625, "reward_std": 0.25, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.625, "step": 898 }, { "completion_length": 489.0, "epoch": 0.24916851441241686, "grad_norm": 0.2973101735115051, "kl": 0.022912748157978058, "learning_rate": 4.975629535771835e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 899 }, { "completion_length": 498.25, "epoch": 0.24944567627494457, "grad_norm": 0.2745434641838074, "kl": 0.026664115488529205, "learning_rate": 4.975568518384489e-06, "loss": -0.0, "reward": 1.875, "reward_std": 0.25, "rewards/confident_score_func": 0.125, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 900 }, { "completion_length": 544.25, "epoch": 0.24972283813747229, "grad_norm": 0.30414801836013794, "kl": 0.02140066586434841, "learning_rate": 4.975507425081755e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 901 }, { "completion_length": 525.5, "epoch": 0.25, "grad_norm": 0.0, "kl": 0.023329472169280052, "learning_rate": 4.9754462558655055e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 902 }, { "completion_length": 512.25, "epoch": 0.25027716186252774, "grad_norm": 0.2982703447341919, "kl": 0.027017321437597275, "learning_rate": 4.975385010737617e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 903 }, { "completion_length": 419.75, "epoch": 0.25055432372505543, "grad_norm": 0.39257869124412537, "kl": 0.025698095560073853, "learning_rate": 4.975323689699968e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 904 }, { "completion_length": 494.5, "epoch": 0.25083148558758317, "grad_norm": 0.3095596134662628, "kl": 0.022311516106128693, "learning_rate": 4.975262292754438e-06, "loss": 0.0, "reward": 1.46875, "reward_std": 0.5625, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 905 }, { "completion_length": 505.75, "epoch": 0.25110864745011086, "grad_norm": 0.2850932776927948, "kl": 0.024328920990228653, "learning_rate": 4.975200819902911e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 906 }, { "completion_length": 541.25, "epoch": 0.2513858093126386, "grad_norm": 0.32447949051856995, "kl": 0.03931991383433342, "learning_rate": 4.97513927114727e-06, "loss": -0.0, "reward": 3.5, "reward_std": 2.0615527629852295, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 907 }, { "completion_length": 552.75, "epoch": 0.2516629711751663, "grad_norm": 0.2891790568828583, "kl": 0.017804397270083427, "learning_rate": 4.975077646489406e-06, "loss": 0.0, "reward": 3.65625, "reward_std": 2.4224965572357178, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.65625, "step": 908 }, { "completion_length": 502.5, "epoch": 0.251940133037694, "grad_norm": 0.305430144071579, "kl": 0.019317205995321274, "learning_rate": 4.975015945931205e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 909 }, { "completion_length": 555.5, "epoch": 0.2522172949002217, "grad_norm": 0.2842849791049957, "kl": 0.019210411235690117, "learning_rate": 4.97495416947456e-06, "loss": 0.0, "reward": 4.21875, "reward_std": 1.7922722101211548, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.71875, "step": 910 }, { "completion_length": 474.75, "epoch": 0.25249445676274945, "grad_norm": 0.33678269386291504, "kl": 0.041984591633081436, "learning_rate": 4.974892317121368e-06, "loss": 0.0, "reward": 1.875, "reward_std": 0.25, "rewards/confident_score_func": 0.125, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 911 }, { "completion_length": 606.25, "epoch": 0.25277161862527714, "grad_norm": 0.24010685086250305, "kl": 0.021748704835772514, "learning_rate": 4.974830388873524e-06, "loss": -0.0, "reward": 5.5, "reward_std": 0.5, "rewards/confident_score_func": 1.75, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 912 }, { "completion_length": 569.75, "epoch": 0.2530487804878049, "grad_norm": 0.248738631606102, "kl": 0.021702682599425316, "learning_rate": 4.9747683847329265e-06, "loss": 0.0, "reward": 4.5, "reward_std": 1.8929693698883057, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 913 }, { "completion_length": 418.75, "epoch": 0.25332594235033257, "grad_norm": 0.36413657665252686, "kl": 0.024998459964990616, "learning_rate": 4.974706304701476e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 914 }, { "completion_length": 581.25, "epoch": 0.2536031042128603, "grad_norm": 0.250516414642334, "kl": 0.019809704273939133, "learning_rate": 4.97464414878108e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 915 }, { "completion_length": 560.75, "epoch": 0.25388026607538805, "grad_norm": 0.28001299500465393, "kl": 0.019413581117987633, "learning_rate": 4.974581916973641e-06, "loss": 0.0, "reward": 4.5, "reward_std": 1.8929693698883057, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 916 }, { "completion_length": 481.25, "epoch": 0.25415742793791574, "grad_norm": 0.28361326456069946, "kl": 0.023862022906541824, "learning_rate": 4.974519609281069e-06, "loss": -0.0, "reward": 5.59375, "reward_std": 0.3125, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 917 }, { "completion_length": 581.75, "epoch": 0.2544345898004435, "grad_norm": 0.2892283499240875, "kl": 0.0325164869427681, "learning_rate": 4.974457225705274e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 918 }, { "completion_length": 551.5, "epoch": 0.25471175166297116, "grad_norm": 0.27085039019584656, "kl": 0.020793817937374115, "learning_rate": 4.97439476624817e-06, "loss": -0.0, "reward": 3.5, "reward_std": 1.443375587463379, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 919 }, { "completion_length": 639.5, "epoch": 0.2549889135254989, "grad_norm": 0.30712801218032837, "kl": 0.02080165036022663, "learning_rate": 4.97433223091167e-06, "loss": -0.0, "reward": 2.875, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 920 }, { "completion_length": 577.0, "epoch": 0.2552660753880266, "grad_norm": 0.30723774433135986, "kl": 0.020539073273539543, "learning_rate": 4.974269619697695e-06, "loss": 0.0, "reward": 5.375, "reward_std": 0.4787135720252991, "rewards/confident_score_func": 1.75, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.75, "step": 921 }, { "completion_length": 497.5, "epoch": 0.25554323725055433, "grad_norm": 0.3320540189743042, "kl": 0.02715064212679863, "learning_rate": 4.974206932608164e-06, "loss": 0.0, "reward": 3.75, "reward_std": 1.7795131206512451, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 922 }, { "completion_length": 515.75, "epoch": 0.255820399113082, "grad_norm": NaN, "kl": 0.02906513772904873, "learning_rate": 4.974144169644998e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 923 }, { "completion_length": 434.5, "epoch": 0.25609756097560976, "grad_norm": 0.3416861891746521, "kl": 0.028179749846458435, "learning_rate": 4.974144169644998e-06, "loss": -0.0, "reward": 2.875, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 924 }, { "completion_length": 477.25, "epoch": 0.25637472283813745, "grad_norm": 0.3986247777938843, "kl": 0.03398977965116501, "learning_rate": 4.974081330810122e-06, "loss": -0.0, "reward": 4.375, "reward_std": 1.8874585628509521, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.625, "step": 925 }, { "completion_length": 528.25, "epoch": 0.2566518847006652, "grad_norm": 0.3015059530735016, "kl": 0.0328083299100399, "learning_rate": 4.974018416105464e-06, "loss": 0.0, "reward": 4.875, "reward_std": 1.75, "rewards/confident_score_func": 1.625, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 926 }, { "completion_length": 453.0, "epoch": 0.25692904656319293, "grad_norm": 0.3448103666305542, "kl": 0.042109258472919464, "learning_rate": 4.973955425532953e-06, "loss": -0.0, "reward": 3.875, "reward_std": 2.1746647357940674, "rewards/confident_score_func": 1.125, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 927 }, { "completion_length": 481.75, "epoch": 0.2572062084257206, "grad_norm": 0.3704117238521576, "kl": 0.039452504366636276, "learning_rate": 4.97389235909452e-06, "loss": -0.0, "reward": 3.25, "reward_std": 1.9148542881011963, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 928 }, { "completion_length": 468.5, "epoch": 0.25748337028824836, "grad_norm": 0.3047181963920593, "kl": 0.030732927843928337, "learning_rate": 4.9738292167921e-06, "loss": -0.0, "reward": 1.875, "reward_std": 0.25, "rewards/confident_score_func": 0.125, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 929 }, { "completion_length": 603.0, "epoch": 0.25776053215077604, "grad_norm": 0.29913195967674255, "kl": 0.019821129739284515, "learning_rate": 4.973765998627628e-06, "loss": 0.0, "reward": 5.25, "reward_std": 0.5773502588272095, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 930 }, { "completion_length": 468.0, "epoch": 0.2580376940133038, "grad_norm": 0.39995941519737244, "kl": 0.026311878114938736, "learning_rate": 4.973702704603044e-06, "loss": -0.0, "reward": 3.125, "reward_std": 1.75, "rewards/confident_score_func": 0.875, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 931 }, { "completion_length": 531.25, "epoch": 0.2583148558758315, "grad_norm": 0.2775128185749054, "kl": 0.03133278712630272, "learning_rate": 4.973639334720288e-06, "loss": -0.0, "reward": 5.59375, "reward_std": 0.3125, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 932 }, { "completion_length": 447.0, "epoch": 0.2585920177383592, "grad_norm": 0.3627470135688782, "kl": 0.030626559630036354, "learning_rate": 4.9735758889813045e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.40824830532073975, "rewards/confident_score_func": 0.125, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 933 }, { "completion_length": 519.25, "epoch": 0.2588691796008869, "grad_norm": 0.3568917512893677, "kl": 0.04829642176628113, "learning_rate": 4.973512367388038e-06, "loss": 0.0, "reward": 4.59375, "reward_std": 1.9185905456542969, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 934 }, { "completion_length": 489.0, "epoch": 0.25914634146341464, "grad_norm": 0.33675333857536316, "kl": 0.031036602333188057, "learning_rate": 4.973448769942437e-06, "loss": -0.0, "reward": 2.5, "reward_std": 1.5, "rewards/confident_score_func": 0.25, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 935 }, { "completion_length": 424.25, "epoch": 0.25942350332594233, "grad_norm": 0.34197914600372314, "kl": 0.02317541465163231, "learning_rate": 4.973385096646451e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 936 }, { "completion_length": 497.5, "epoch": 0.25970066518847007, "grad_norm": 0.301920622587204, "kl": 0.025513727217912674, "learning_rate": 4.973321347502033e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 937 }, { "completion_length": 507.0, "epoch": 0.25997782705099776, "grad_norm": 0.31800577044487, "kl": 0.028729621320962906, "learning_rate": 4.973257522511139e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 938 }, { "completion_length": 531.75, "epoch": 0.2602549889135255, "grad_norm": 0.0, "kl": 0.02940232865512371, "learning_rate": 4.973193621675724e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 939 }, { "completion_length": 481.75, "epoch": 0.26053215077605324, "grad_norm": 0.3325009047985077, "kl": 0.0254697073251009, "learning_rate": 4.973129644997749e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 940 }, { "completion_length": 476.25, "epoch": 0.2608093126385809, "grad_norm": 0.3298417925834656, "kl": 0.026209669187664986, "learning_rate": 4.973065592479177e-06, "loss": 0.0, "reward": 1.875, "reward_std": 0.25, "rewards/confident_score_func": 0.125, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 941 }, { "completion_length": 517.25, "epoch": 0.26108647450110867, "grad_norm": 0.34049391746520996, "kl": 0.025155644863843918, "learning_rate": 4.97300146412197e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 942 }, { "completion_length": 551.75, "epoch": 0.26136363636363635, "grad_norm": 0.3424779176712036, "kl": 0.028401482850313187, "learning_rate": 4.972937259928096e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 943 }, { "completion_length": 461.5, "epoch": 0.2616407982261641, "grad_norm": 0.3363235592842102, "kl": 0.029920639470219612, "learning_rate": 4.9728729798995225e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 944 }, { "completion_length": 505.25, "epoch": 0.2619179600886918, "grad_norm": 0.33054468035697937, "kl": 0.02607949636876583, "learning_rate": 4.972808624038222e-06, "loss": -0.0, "reward": 3.875, "reward_std": 2.1746647357940674, "rewards/confident_score_func": 1.125, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 945 }, { "completion_length": 450.5, "epoch": 0.2621951219512195, "grad_norm": 0.3323685824871063, "kl": 0.02761324867606163, "learning_rate": 4.972744192346168e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 946 }, { "completion_length": 478.0, "epoch": 0.2624722838137472, "grad_norm": 0.31160616874694824, "kl": 0.027006682008504868, "learning_rate": 4.9726796848253365e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 947 }, { "completion_length": 535.5, "epoch": 0.26274944567627495, "grad_norm": 0.29140153527259827, "kl": 0.03517551347613335, "learning_rate": 4.972615101477704e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 948 }, { "completion_length": 465.5, "epoch": 0.26302660753880264, "grad_norm": 0.3432285785675049, "kl": 0.026582865044474602, "learning_rate": 4.972550442305253e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 949 }, { "completion_length": 522.0, "epoch": 0.2633037694013304, "grad_norm": 0.3043143153190613, "kl": 0.024394001811742783, "learning_rate": 4.972485707309965e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 950 }, { "completion_length": 507.75, "epoch": 0.2635809312638581, "grad_norm": 0.30322447419166565, "kl": 0.02712501399219036, "learning_rate": 4.972420896493826e-06, "loss": 0.0, "reward": 4.875, "reward_std": 1.75, "rewards/confident_score_func": 1.625, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 951 }, { "completion_length": 509.5, "epoch": 0.2638580931263858, "grad_norm": 0.3824939429759979, "kl": 0.02922827936708927, "learning_rate": 4.972356009858823e-06, "loss": 0.0, "reward": 3.5, "reward_std": 2.0615527629852295, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 952 }, { "completion_length": 508.25, "epoch": 0.26413525498891355, "grad_norm": 0.3301542401313782, "kl": 0.025922944769263268, "learning_rate": 4.972291047406945e-06, "loss": 0.0, "reward": 4.625, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.75, "step": 953 }, { "completion_length": 518.0, "epoch": 0.26441241685144123, "grad_norm": 0.29013293981552124, "kl": 0.02337457984685898, "learning_rate": 4.972226009140187e-06, "loss": -0.0, "reward": 2.875, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 954 }, { "completion_length": 467.5, "epoch": 0.264689578713969, "grad_norm": 0.3940070867538452, "kl": 0.0231831893324852, "learning_rate": 4.97216089506054e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 955 }, { "completion_length": 544.75, "epoch": 0.26496674057649666, "grad_norm": 0.2986680567264557, "kl": 0.02490820363163948, "learning_rate": 4.972095705170002e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 956 }, { "completion_length": 463.25, "epoch": 0.2652439024390244, "grad_norm": 0.3676975965499878, "kl": 0.031658466905355453, "learning_rate": 4.972030439470573e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 957 }, { "completion_length": 524.25, "epoch": 0.2655210643015521, "grad_norm": 0.28253063559532166, "kl": 0.024574317038059235, "learning_rate": 4.971965097964255e-06, "loss": 0.0, "reward": 3.0, "reward_std": 1.8484227657318115, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 958 }, { "completion_length": 539.25, "epoch": 0.26579822616407983, "grad_norm": 0.2984132170677185, "kl": 0.028437616303563118, "learning_rate": 4.971899680653048e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 959 }, { "completion_length": 542.25, "epoch": 0.2660753880266075, "grad_norm": 0.33361247181892395, "kl": 0.02794615738093853, "learning_rate": 4.971834187538963e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 960 }, { "completion_length": 516.75, "epoch": 0.26635254988913526, "grad_norm": 0.0, "kl": 0.02906254678964615, "learning_rate": 4.971768618624005e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 961 }, { "completion_length": 424.5, "epoch": 0.26662971175166295, "grad_norm": 0.3514777421951294, "kl": 0.02764248289167881, "learning_rate": 4.971702973910185e-06, "loss": 0.0, "reward": 4.5, "reward_std": 1.8929693698883057, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 962 }, { "completion_length": 461.25, "epoch": 0.2669068736141907, "grad_norm": 0.3126007914543152, "kl": 0.022827191278338432, "learning_rate": 4.971637253399519e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 963 }, { "completion_length": 536.0, "epoch": 0.26718403547671843, "grad_norm": 0.29111310839653015, "kl": 0.02440814860165119, "learning_rate": 4.971571457094017e-06, "loss": 0.0, "reward": 4.5, "reward_std": 1.8929693698883057, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 964 }, { "completion_length": 514.5, "epoch": 0.2674611973392461, "grad_norm": 0.24143190681934357, "kl": 0.0216519832611084, "learning_rate": 4.971505584995703e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 965 }, { "completion_length": 568.0, "epoch": 0.26773835920177386, "grad_norm": 0.24711492657661438, "kl": 0.02212749607861042, "learning_rate": 4.971439637106592e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 966 }, { "completion_length": 442.5, "epoch": 0.26801552106430154, "grad_norm": 0.3106137216091156, "kl": 0.027863608673214912, "learning_rate": 4.971373613428709e-06, "loss": -0.0, "reward": 5.0, "reward_std": 0.5, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 967 }, { "completion_length": 503.0, "epoch": 0.2682926829268293, "grad_norm": 0.26791685819625854, "kl": 0.020996879786252975, "learning_rate": 4.971307513964078e-06, "loss": 0.0, "reward": 5.40625, "reward_std": 0.6875, "rewards/confident_score_func": 1.75, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.65625, "step": 968 }, { "completion_length": 563.5, "epoch": 0.26856984478935697, "grad_norm": 0.3540748059749603, "kl": 0.030340436846017838, "learning_rate": 4.971241338714726e-06, "loss": -0.0, "reward": 1.875, "reward_std": 0.25, "rewards/confident_score_func": 0.125, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 969 }, { "completion_length": 446.0, "epoch": 0.2688470066518847, "grad_norm": 0.0, "kl": 0.03305058553814888, "learning_rate": 4.971175087682682e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 970 }, { "completion_length": 512.5, "epoch": 0.2691241685144124, "grad_norm": 0.31016090512275696, "kl": 0.02941289357841015, "learning_rate": 4.971108760869978e-06, "loss": -0.0, "reward": 1.875, "reward_std": 0.25, "rewards/confident_score_func": 0.125, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 971 }, { "completion_length": 498.0, "epoch": 0.26940133037694014, "grad_norm": 0.0, "kl": 0.024294987320899963, "learning_rate": 4.9710423582786485e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 972 }, { "completion_length": 498.0, "epoch": 0.2696784922394678, "grad_norm": 0.28248390555381775, "kl": 0.027166960760951042, "learning_rate": 4.970975879910728e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 973 }, { "completion_length": 599.75, "epoch": 0.26995565410199557, "grad_norm": 0.3234221637248993, "kl": 0.017307503148913383, "learning_rate": 4.970909325768256e-06, "loss": -0.0, "reward": 1.875, "reward_std": 0.25, "rewards/confident_score_func": 0.125, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 974 }, { "completion_length": 529.0, "epoch": 0.27023281596452325, "grad_norm": 0.32616057991981506, "kl": 0.01907181367278099, "learning_rate": 4.970842695853275e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 975 }, { "completion_length": 478.25, "epoch": 0.270509977827051, "grad_norm": 0.3626704812049866, "kl": 0.028212130069732666, "learning_rate": 4.970775990167826e-06, "loss": -0.0, "reward": 2.875, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 976 }, { "completion_length": 505.0, "epoch": 0.27078713968957874, "grad_norm": 0.30439293384552, "kl": 0.02463732659816742, "learning_rate": 4.9707092087139565e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 977 }, { "completion_length": 587.75, "epoch": 0.2710643015521064, "grad_norm": 0.24851323664188385, "kl": 0.02331426739692688, "learning_rate": 4.970642351493713e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 978 }, { "completion_length": 450.0, "epoch": 0.27134146341463417, "grad_norm": 0.31119003891944885, "kl": 0.02660609781742096, "learning_rate": 4.970575418509146e-06, "loss": 0.0, "reward": 4.5, "reward_std": 1.8929693698883057, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 979 }, { "completion_length": 473.5, "epoch": 0.27161862527716185, "grad_norm": 0.3050706088542938, "kl": 0.020126357674598694, "learning_rate": 4.970508409762308e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 980 }, { "completion_length": 551.0, "epoch": 0.2718957871396896, "grad_norm": 0.31312358379364014, "kl": 0.02405633218586445, "learning_rate": 4.970441325255255e-06, "loss": -0.0, "reward": 3.5, "reward_std": 2.0615527629852295, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 981 }, { "completion_length": 447.0, "epoch": 0.2721729490022173, "grad_norm": 0.4288487434387207, "kl": 0.021807989105582237, "learning_rate": 4.970374164990042e-06, "loss": 0.0, "reward": 2.625, "reward_std": 1.4361406564712524, "rewards/confident_score_func": 0.375, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 982 }, { "completion_length": 501.25, "epoch": 0.272450110864745, "grad_norm": 0.32902973890304565, "kl": 0.02064395695924759, "learning_rate": 4.9703069289687315e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 983 }, { "completion_length": 514.0, "epoch": 0.2727272727272727, "grad_norm": 0.30893322825431824, "kl": 0.019610535353422165, "learning_rate": 4.970239617193383e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 984 }, { "completion_length": 533.0, "epoch": 0.27300443458980045, "grad_norm": 0.332278847694397, "kl": 0.017158562317490578, "learning_rate": 4.970172229666061e-06, "loss": -0.0, "reward": 2.625, "reward_std": 1.4361406564712524, "rewards/confident_score_func": 0.375, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 985 }, { "completion_length": 448.0, "epoch": 0.27328159645232813, "grad_norm": 0.33697038888931274, "kl": 0.02101190946996212, "learning_rate": 4.970104766388833e-06, "loss": 0.0, "reward": 3.875, "reward_std": 2.1746647357940674, "rewards/confident_score_func": 1.125, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 986 }, { "completion_length": 581.75, "epoch": 0.2735587583148559, "grad_norm": 0.0, "kl": 0.02001410722732544, "learning_rate": 4.970037227363766e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 987 }, { "completion_length": 540.25, "epoch": 0.2738359201773836, "grad_norm": 0.37273868918418884, "kl": 0.021844414994120598, "learning_rate": 4.969969612592934e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 988 }, { "completion_length": 628.5, "epoch": 0.2741130820399113, "grad_norm": 0.29334038496017456, "kl": 0.0222599096596241, "learning_rate": 4.969901922078408e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 989 }, { "completion_length": 537.25, "epoch": 0.27439024390243905, "grad_norm": 0.29427263140678406, "kl": 0.02367965318262577, "learning_rate": 4.969834155822265e-06, "loss": -0.0, "reward": 5.5, "reward_std": 0.5, "rewards/confident_score_func": 1.75, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 990 }, { "completion_length": 492.0, "epoch": 0.27466740576496673, "grad_norm": 0.2976175546646118, "kl": 0.022305725142359734, "learning_rate": 4.969766313826582e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 991 }, { "completion_length": 570.0, "epoch": 0.2749445676274945, "grad_norm": 0.3072894513607025, "kl": 0.019768282771110535, "learning_rate": 4.96969839609344e-06, "loss": 0.0, "reward": 3.5, "reward_std": 2.0615527629852295, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 992 }, { "completion_length": 481.25, "epoch": 0.27522172949002216, "grad_norm": 0.29319047927856445, "kl": 0.020777828991413116, "learning_rate": 4.969630402624923e-06, "loss": -0.0, "reward": 2.875, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 993 }, { "completion_length": 567.5, "epoch": 0.2754988913525499, "grad_norm": 0.297734797000885, "kl": 0.016030076891183853, "learning_rate": 4.969562333423114e-06, "loss": 0.0, "reward": 4.5, "reward_std": 1.8929693698883057, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 994 }, { "completion_length": 603.75, "epoch": 0.2757760532150776, "grad_norm": 0.2995712161064148, "kl": 0.019474955275654793, "learning_rate": 4.969494188490102e-06, "loss": -0.0, "reward": 4.25, "reward_std": 1.7320507764816284, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 995 }, { "completion_length": 460.5, "epoch": 0.27605321507760533, "grad_norm": 0.37485018372535706, "kl": 0.02342281863093376, "learning_rate": 4.969425967827976e-06, "loss": 0.0, "reward": 2.875, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 996 }, { "completion_length": 587.0, "epoch": 0.276330376940133, "grad_norm": 0.33553841710090637, "kl": 0.04152394458651543, "learning_rate": 4.969357671438827e-06, "loss": -0.0, "reward": 2.71875, "reward_std": 2.0216922760009766, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 997 }, { "completion_length": 486.75, "epoch": 0.27660753880266076, "grad_norm": 0.0, "kl": 0.03520695120096207, "learning_rate": 4.969289299324751e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 998 }, { "completion_length": 571.25, "epoch": 0.27688470066518844, "grad_norm": 0.4560244679450989, "kl": 0.016956893727183342, "learning_rate": 4.9692208514878445e-06, "loss": -0.0, "reward": 2.5, "reward_std": 1.5, "rewards/confident_score_func": 0.25, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 999 }, { "completion_length": 480.0, "epoch": 0.2771618625277162, "grad_norm": 0.3353462517261505, "kl": 0.024214675650000572, "learning_rate": 4.969152327930207e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1000 }, { "completion_length": 482.5, "epoch": 0.2774390243902439, "grad_norm": 0.3155367970466614, "kl": 0.02265956997871399, "learning_rate": 4.969083728653937e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1001 }, { "completion_length": 462.75, "epoch": 0.2777161862527716, "grad_norm": 0.3537786304950714, "kl": 0.020038723945617676, "learning_rate": 4.969015053661142e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1002 }, { "completion_length": 538.25, "epoch": 0.27799334811529935, "grad_norm": 0.2977961003780365, "kl": 0.01621602475643158, "learning_rate": 4.9689463029539256e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1003 }, { "completion_length": 509.0, "epoch": 0.27827050997782704, "grad_norm": 0.29909756779670715, "kl": 0.022566745057702065, "learning_rate": 4.9688774765343965e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1004 }, { "completion_length": 402.25, "epoch": 0.2785476718403548, "grad_norm": 0.0, "kl": 0.02703283727169037, "learning_rate": 4.9688085744046655e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1005 }, { "completion_length": 463.5, "epoch": 0.27882483370288247, "grad_norm": 0.2987987697124481, "kl": 0.022834748029708862, "learning_rate": 4.968739596566846e-06, "loss": -0.0, "reward": 2.5, "reward_std": 1.5, "rewards/confident_score_func": 0.25, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1006 }, { "completion_length": 544.75, "epoch": 0.2791019955654102, "grad_norm": 0.2873808741569519, "kl": 0.017148248851299286, "learning_rate": 4.968670543023052e-06, "loss": -0.0, "reward": 3.5, "reward_std": 2.0615527629852295, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1007 }, { "completion_length": 488.0, "epoch": 0.2793791574279379, "grad_norm": 0.3106774687767029, "kl": 0.023904765024781227, "learning_rate": 4.968601413775402e-06, "loss": -0.0, "reward": 2.875, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1008 }, { "completion_length": 614.0, "epoch": 0.27965631929046564, "grad_norm": 0.29677215218544006, "kl": 0.020153580233454704, "learning_rate": 4.968532208826015e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1009 }, { "completion_length": 480.75, "epoch": 0.2799334811529933, "grad_norm": 0.3329157829284668, "kl": 0.0230264775454998, "learning_rate": 4.968462928177016e-06, "loss": -0.0, "reward": 2.5, "reward_std": 1.5, "rewards/confident_score_func": 0.25, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1010 }, { "completion_length": 478.75, "epoch": 0.28021064301552107, "grad_norm": 0.0, "kl": 0.026932155713438988, "learning_rate": 4.968393571830526e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1011 }, { "completion_length": 492.0, "epoch": 0.2804878048780488, "grad_norm": 0.2831816077232361, "kl": 0.02054126001894474, "learning_rate": 4.968324139788675e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1012 }, { "completion_length": 520.0, "epoch": 0.2807649667405765, "grad_norm": 0.30883270502090454, "kl": 0.03048417530953884, "learning_rate": 4.968254632053589e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1013 }, { "completion_length": 489.75, "epoch": 0.28104212860310424, "grad_norm": 0.3475154936313629, "kl": 0.019446592777967453, "learning_rate": 4.968185048627402e-06, "loss": 0.0, "reward": 4.5, "reward_std": 1.8929693698883057, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1014 }, { "completion_length": 495.0, "epoch": 0.2813192904656319, "grad_norm": 0.3206675946712494, "kl": 0.02769070491194725, "learning_rate": 4.968115389512246e-06, "loss": 0.0, "reward": 4.5, "reward_std": 1.8929693698883057, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1015 }, { "completion_length": 431.0, "epoch": 0.28159645232815966, "grad_norm": 0.38152778148651123, "kl": 0.03455011546611786, "learning_rate": 4.968045654710259e-06, "loss": -0.0, "reward": 3.875, "reward_std": 2.1746647357940674, "rewards/confident_score_func": 1.125, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1016 }, { "completion_length": 499.75, "epoch": 0.28187361419068735, "grad_norm": 0.32047414779663086, "kl": 0.02214740961790085, "learning_rate": 4.967975844223578e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1017 }, { "completion_length": 434.5, "epoch": 0.2821507760532151, "grad_norm": 0.0, "kl": 0.022533560171723366, "learning_rate": 4.967905958054345e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1018 }, { "completion_length": 506.0, "epoch": 0.2824279379157428, "grad_norm": 0.31036797165870667, "kl": 0.019228214398026466, "learning_rate": 4.967835996204701e-06, "loss": -0.0, "reward": 3.25, "reward_std": 1.7320507764816284, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1019 }, { "completion_length": 444.5, "epoch": 0.2827050997782705, "grad_norm": 0.3600473701953888, "kl": 0.01904992386698723, "learning_rate": 4.967765958676795e-06, "loss": -0.0, "reward": 2.5, "reward_std": 1.5, "rewards/confident_score_func": 0.25, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1020 }, { "completion_length": 448.5, "epoch": 0.2829822616407982, "grad_norm": 0.3129110038280487, "kl": 0.04018457606434822, "learning_rate": 4.967695845472772e-06, "loss": 0.0, "reward": 4.0, "reward_std": 2.0615527629852295, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1021 }, { "completion_length": 486.75, "epoch": 0.28325942350332595, "grad_norm": 0.2989726662635803, "kl": 0.021018987521529198, "learning_rate": 4.967625656594782e-06, "loss": -0.0, "reward": 3.5, "reward_std": 2.0615527629852295, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1022 }, { "completion_length": 474.25, "epoch": 0.28353658536585363, "grad_norm": 0.3441036641597748, "kl": 0.03325868770480156, "learning_rate": 4.96755539204498e-06, "loss": -0.0, "reward": 3.5, "reward_std": 2.0615527629852295, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1023 }, { "completion_length": 445.75, "epoch": 0.2838137472283814, "grad_norm": 0.5932043194770813, "kl": 0.025172177702188492, "learning_rate": 4.967485051825517e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1024 }, { "completion_length": 464.5, "epoch": 0.2840909090909091, "grad_norm": 0.32358694076538086, "kl": 0.020389506593346596, "learning_rate": 4.967414635938552e-06, "loss": -0.0, "reward": 3.5, "reward_std": 2.0615527629852295, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1025 }, { "completion_length": 582.0, "epoch": 0.2843680709534368, "grad_norm": 0.304472953081131, "kl": 0.021209221333265305, "learning_rate": 4.967344144386246e-06, "loss": -0.0, "reward": 3.5, "reward_std": 2.0615527629852295, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1026 }, { "completion_length": 457.0, "epoch": 0.28464523281596454, "grad_norm": 0.34465697407722473, "kl": 0.03037424385547638, "learning_rate": 4.967273577170758e-06, "loss": 0.0, "reward": 2.5, "reward_std": 1.5545631647109985, "rewards/confident_score_func": 0.375, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1027 }, { "completion_length": 442.0, "epoch": 0.28492239467849223, "grad_norm": 0.0, "kl": 0.02800491452217102, "learning_rate": 4.967202934294253e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1028 }, { "completion_length": 392.25, "epoch": 0.28519955654101997, "grad_norm": 0.3652357757091522, "kl": 0.030357858166098595, "learning_rate": 4.9671322157588965e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1029 }, { "completion_length": 413.25, "epoch": 0.28547671840354766, "grad_norm": 0.3210616409778595, "kl": 0.03111010044813156, "learning_rate": 4.967061421566859e-06, "loss": -0.0, "reward": 2.90625, "reward_std": 1.907701015472412, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.65625, "step": 1030 }, { "completion_length": 426.0, "epoch": 0.2857538802660754, "grad_norm": 0.3739263117313385, "kl": 0.05296541377902031, "learning_rate": 4.96699055172031e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1031 }, { "completion_length": 445.25, "epoch": 0.2860310421286031, "grad_norm": 0.0, "kl": 0.03957776352763176, "learning_rate": 4.966919606221423e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1032 }, { "completion_length": 493.5, "epoch": 0.2863082039911308, "grad_norm": 0.361483097076416, "kl": 0.037673220038414, "learning_rate": 4.966848585072374e-06, "loss": 0.0, "reward": 3.625, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.875, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1033 }, { "completion_length": 449.75, "epoch": 0.2865853658536585, "grad_norm": 0.32363593578338623, "kl": 0.023143110796809196, "learning_rate": 4.966777488275341e-06, "loss": -0.0, "reward": 3.875, "reward_std": 2.1746647357940674, "rewards/confident_score_func": 1.125, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1034 }, { "completion_length": 547.25, "epoch": 0.28686252771618626, "grad_norm": 0.4354513883590698, "kl": 0.024309191852808, "learning_rate": 4.966706315832503e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1035 }, { "completion_length": 495.25, "epoch": 0.28713968957871394, "grad_norm": 0.33565303683280945, "kl": 0.02751338854432106, "learning_rate": 4.966635067746045e-06, "loss": 0.0, "reward": 2.875, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1036 }, { "completion_length": 492.0, "epoch": 0.2874168514412417, "grad_norm": 0.29692068696022034, "kl": 0.025148700922727585, "learning_rate": 4.96656374401815e-06, "loss": -0.0, "reward": 3.5, "reward_std": 2.0615527629852295, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1037 }, { "completion_length": 530.75, "epoch": 0.2876940133037694, "grad_norm": 0.3045816719532013, "kl": 0.024219360202550888, "learning_rate": 4.966492344651006e-06, "loss": -0.0, "reward": 2.875, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1038 }, { "completion_length": 502.25, "epoch": 0.2879711751662971, "grad_norm": 0.3335307240486145, "kl": 0.024820668622851372, "learning_rate": 4.966420869646801e-06, "loss": -0.0, "reward": 3.5, "reward_std": 2.0615527629852295, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1039 }, { "completion_length": 495.0, "epoch": 0.28824833702882485, "grad_norm": 0.3368712067604065, "kl": 0.029247086495161057, "learning_rate": 4.966349319007728e-06, "loss": 0.0, "reward": 4.625, "reward_std": 1.6520190238952637, "rewards/confident_score_func": 1.375, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1040 }, { "completion_length": 475.25, "epoch": 0.28852549889135254, "grad_norm": 0.3666885495185852, "kl": 0.04959164932370186, "learning_rate": 4.966277692735982e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.041241407394409, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1041 }, { "completion_length": 443.0, "epoch": 0.2888026607538803, "grad_norm": 0.32575345039367676, "kl": 0.02477886900305748, "learning_rate": 4.966205990833758e-06, "loss": 0.0, "reward": 5.25, "reward_std": 0.5773502588272095, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1042 }, { "completion_length": 483.0, "epoch": 0.28907982261640797, "grad_norm": 0.32656869292259216, "kl": 0.03952339291572571, "learning_rate": 4.9661342133032565e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1043 }, { "completion_length": 522.75, "epoch": 0.2893569844789357, "grad_norm": 0.34397801756858826, "kl": 0.028796490281820297, "learning_rate": 4.966062360146676e-06, "loss": 0.0, "reward": 4.34375, "reward_std": 1.7776827812194824, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 1044 }, { "completion_length": 476.75, "epoch": 0.2896341463414634, "grad_norm": 0.0, "kl": 0.027844969183206558, "learning_rate": 4.965990431366222e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1045 }, { "completion_length": 482.0, "epoch": 0.28991130820399114, "grad_norm": 0.31828585267066956, "kl": 0.034619495272636414, "learning_rate": 4.9659184269641e-06, "loss": -0.0, "reward": 3.5, "reward_std": 2.0615527629852295, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1046 }, { "completion_length": 471.75, "epoch": 0.2901884700665188, "grad_norm": 0.4062064588069916, "kl": 0.0322834849357605, "learning_rate": 4.965846346942518e-06, "loss": -0.0, "reward": 3.25, "reward_std": 1.7320507764816284, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1047 }, { "completion_length": 462.5, "epoch": 0.29046563192904656, "grad_norm": 0.33396461606025696, "kl": 0.0359039232134819, "learning_rate": 4.965774191303686e-06, "loss": 0.0, "reward": 3.875, "reward_std": 2.1746647357940674, "rewards/confident_score_func": 1.125, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1048 }, { "completion_length": 484.25, "epoch": 0.2907427937915743, "grad_norm": 0.35253387689590454, "kl": 0.0367174930870533, "learning_rate": 4.965701960049817e-06, "loss": 0.0, "reward": 5.25, "reward_std": 0.5773502588272095, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1049 }, { "completion_length": 465.0, "epoch": 0.291019955654102, "grad_norm": 0.4401179254055023, "kl": 0.02744162082672119, "learning_rate": 4.965629653183126e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1050 }, { "completion_length": 424.5, "epoch": 0.29129711751662973, "grad_norm": 0.0, "kl": 0.03651702031493187, "learning_rate": 4.965557270705831e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1051 }, { "completion_length": 361.75, "epoch": 0.2915742793791574, "grad_norm": 0.4160190224647522, "kl": 0.03376837447285652, "learning_rate": 4.96548481262015e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1052 }, { "completion_length": 436.75, "epoch": 0.29185144124168516, "grad_norm": 0.3303115665912628, "kl": 0.05793340504169464, "learning_rate": 4.9654122789283055e-06, "loss": 0.0, "reward": 2.875, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1053 }, { "completion_length": 478.5, "epoch": 0.29212860310421285, "grad_norm": 0.3512338697910309, "kl": 0.04232212156057358, "learning_rate": 4.965339669632523e-06, "loss": -0.0, "reward": 3.5, "reward_std": 2.0615527629852295, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1054 }, { "completion_length": 459.5, "epoch": 0.2924057649667406, "grad_norm": 0.33677735924720764, "kl": 0.03593214973807335, "learning_rate": 4.9652669847350275e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1055 }, { "completion_length": 389.75, "epoch": 0.2926829268292683, "grad_norm": 0.38832664489746094, "kl": 0.037278320640325546, "learning_rate": 4.96519422423805e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1056 }, { "completion_length": 467.25, "epoch": 0.292960088691796, "grad_norm": 0.3625565469264984, "kl": 0.0398838147521019, "learning_rate": 4.96512138814382e-06, "loss": -0.0, "reward": 4.375, "reward_std": 1.4930394887924194, "rewards/confident_score_func": 1.125, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1057 }, { "completion_length": 483.5, "epoch": 0.2932372505543237, "grad_norm": 0.35206839442253113, "kl": 0.03982047736644745, "learning_rate": 4.965048476454572e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1058 }, { "completion_length": 495.0, "epoch": 0.29351441241685144, "grad_norm": 0.4816044270992279, "kl": 0.03457895666360855, "learning_rate": 4.964975489172541e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1059 }, { "completion_length": 520.25, "epoch": 0.29379157427937913, "grad_norm": 0.0, "kl": 0.03293837234377861, "learning_rate": 4.964902426299966e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1060 }, { "completion_length": 455.5, "epoch": 0.2940687361419069, "grad_norm": 0.34740912914276123, "kl": 0.07394766807556152, "learning_rate": 4.964829287839088e-06, "loss": 0.0, "reward": 5.5, "reward_std": 0.5, "rewards/confident_score_func": 1.75, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1061 }, { "completion_length": 439.75, "epoch": 0.2943458980044346, "grad_norm": 0.36179783940315247, "kl": 0.03114972449839115, "learning_rate": 4.964756073792148e-06, "loss": -0.0, "reward": 1.875, "reward_std": 0.25, "rewards/confident_score_func": 0.125, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1062 }, { "completion_length": 465.5, "epoch": 0.2946230598669623, "grad_norm": 0.390921413898468, "kl": 0.0378875732421875, "learning_rate": 4.964682784161394e-06, "loss": -0.0, "reward": 2.5, "reward_std": 1.5, "rewards/confident_score_func": 0.25, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1063 }, { "completion_length": 477.5, "epoch": 0.29490022172949004, "grad_norm": 0.3205554783344269, "kl": 0.03287122771143913, "learning_rate": 4.9646094189490716e-06, "loss": -0.0, "reward": 5.5, "reward_std": 0.5, "rewards/confident_score_func": 1.75, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1064 }, { "completion_length": 405.0, "epoch": 0.29517738359201773, "grad_norm": 0.37201449275016785, "kl": 0.04095079377293587, "learning_rate": 4.964535978157429e-06, "loss": 0.0, "reward": 3.875, "reward_std": 2.1746647357940674, "rewards/confident_score_func": 1.125, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1065 }, { "completion_length": 543.25, "epoch": 0.29545454545454547, "grad_norm": 0.3353775441646576, "kl": 0.03817274048924446, "learning_rate": 4.964462461788721e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1066 }, { "completion_length": 434.25, "epoch": 0.29573170731707316, "grad_norm": 0.3651259243488312, "kl": 0.045052334666252136, "learning_rate": 4.964388869845202e-06, "loss": 0.0, "reward": 4.625, "reward_std": 1.6520190238952637, "rewards/confident_score_func": 1.375, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1067 }, { "completion_length": 543.0, "epoch": 0.2960088691796009, "grad_norm": 0.31074440479278564, "kl": 0.028109660372138023, "learning_rate": 4.964315202329127e-06, "loss": -0.0, "reward": 3.875, "reward_std": 2.1746647357940674, "rewards/confident_score_func": 1.125, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1068 }, { "completion_length": 474.5, "epoch": 0.2962860310421286, "grad_norm": 0.44482308626174927, "kl": 0.039434902369976044, "learning_rate": 4.964241459242757e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1069 }, { "completion_length": 514.25, "epoch": 0.2965631929046563, "grad_norm": 0.38938137888908386, "kl": 0.03500547632575035, "learning_rate": 4.964167640588352e-06, "loss": -0.0, "reward": 5.5, "reward_std": 0.5, "rewards/confident_score_func": 1.75, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1070 }, { "completion_length": 502.25, "epoch": 0.296840354767184, "grad_norm": 0.3133224844932556, "kl": 0.03191190958023071, "learning_rate": 4.964093746368176e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1071 }, { "completion_length": 496.5, "epoch": 0.29711751662971175, "grad_norm": 0.351371169090271, "kl": 0.044664911925792694, "learning_rate": 4.964019776584496e-06, "loss": -0.0, "reward": 3.875, "reward_std": 2.1746647357940674, "rewards/confident_score_func": 1.125, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1072 }, { "completion_length": 445.5, "epoch": 0.2973946784922395, "grad_norm": 0.0, "kl": 0.03828996419906616, "learning_rate": 4.963945731239578e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1073 }, { "completion_length": 445.75, "epoch": 0.2976718403547672, "grad_norm": 0.43228480219841003, "kl": 0.06774173676967621, "learning_rate": 4.963871610335695e-06, "loss": 0.0, "reward": 2.875, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1074 }, { "completion_length": 534.5, "epoch": 0.2979490022172949, "grad_norm": 0.30523794889450073, "kl": 0.03228898346424103, "learning_rate": 4.963797413875119e-06, "loss": -0.0, "reward": 3.5, "reward_std": 2.0615527629852295, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1075 }, { "completion_length": 462.75, "epoch": 0.2982261640798226, "grad_norm": 0.33238041400909424, "kl": 0.03792896866798401, "learning_rate": 4.963723141860126e-06, "loss": -0.0, "reward": 5.5, "reward_std": 0.5, "rewards/confident_score_func": 1.75, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1076 }, { "completion_length": 502.75, "epoch": 0.29850332594235035, "grad_norm": 0.3622886836528778, "kl": 0.037453994154930115, "learning_rate": 4.963648794292992e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1077 }, { "completion_length": 508.0, "epoch": 0.29878048780487804, "grad_norm": 0.5741526484489441, "kl": 0.03601735457777977, "learning_rate": 4.9635743711759996e-06, "loss": -0.0, "reward": 4.25, "reward_std": 1.7320507764816284, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1078 }, { "completion_length": 469.25, "epoch": 0.2990576496674058, "grad_norm": 0.0, "kl": 0.04271135479211807, "learning_rate": 4.963499872511428e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1079 }, { "completion_length": 571.75, "epoch": 0.29933481152993346, "grad_norm": 0.3605673015117645, "kl": 0.038134679198265076, "learning_rate": 4.963425298301564e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1080 }, { "completion_length": 444.5, "epoch": 0.2996119733924612, "grad_norm": 0.4661386013031006, "kl": 0.04942169040441513, "learning_rate": 4.963350648548693e-06, "loss": -0.0, "reward": 4.25, "reward_std": 1.7320507764816284, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1081 }, { "completion_length": 461.75, "epoch": 0.2998891352549889, "grad_norm": 0.6620126962661743, "kl": 0.03972228616476059, "learning_rate": 4.9632759232551056e-06, "loss": -0.0, "reward": 3.875, "reward_std": 2.1746647357940674, "rewards/confident_score_func": 1.125, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1082 }, { "completion_length": 401.0, "epoch": 0.30016629711751663, "grad_norm": 0.5078964829444885, "kl": 0.03811978921294212, "learning_rate": 4.963201122423093e-06, "loss": 0.0, "reward": 3.75, "reward_std": 1.7795131206512451, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1083 }, { "completion_length": 431.25, "epoch": 0.3004434589800443, "grad_norm": 0.4329546093940735, "kl": 0.06719096750020981, "learning_rate": 4.9631262460549475e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1084 }, { "completion_length": 456.0, "epoch": 0.30072062084257206, "grad_norm": 0.36193811893463135, "kl": 0.04905031993985176, "learning_rate": 4.963051294152967e-06, "loss": -0.0, "reward": 1.875, "reward_std": 0.25, "rewards/confident_score_func": 0.125, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1085 }, { "completion_length": 475.5, "epoch": 0.3009977827050998, "grad_norm": 0.34419408440589905, "kl": 0.03968197479844093, "learning_rate": 4.962976266719449e-06, "loss": 0.0, "reward": 4.5, "reward_std": 1.8929693698883057, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1086 }, { "completion_length": 465.75, "epoch": 0.3012749445676275, "grad_norm": 0.0, "kl": 0.03462084382772446, "learning_rate": 4.962901163756695e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1087 }, { "completion_length": 451.75, "epoch": 0.30155210643015523, "grad_norm": 0.0, "kl": 0.05594726651906967, "learning_rate": 4.962825985267007e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1088 }, { "completion_length": 457.0, "epoch": 0.3018292682926829, "grad_norm": 0.3677133023738861, "kl": 0.043869804590940475, "learning_rate": 4.962750731252692e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1089 }, { "completion_length": 535.25, "epoch": 0.30210643015521066, "grad_norm": 0.30385148525238037, "kl": 0.04264413192868233, "learning_rate": 4.962675401716056e-06, "loss": 0.0, "reward": 5.5, "reward_std": 0.5, "rewards/confident_score_func": 1.75, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1090 }, { "completion_length": 468.5, "epoch": 0.30238359201773835, "grad_norm": 0.33451977372169495, "kl": 0.06834236532449722, "learning_rate": 4.962599996659411e-06, "loss": 0.0, "reward": 4.5, "reward_std": 1.8929693698883057, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1091 }, { "completion_length": 487.0, "epoch": 0.3026607538802661, "grad_norm": 0.37245920300483704, "kl": 0.03832004964351654, "learning_rate": 4.9625245160850674e-06, "loss": -0.0, "reward": 3.25, "reward_std": 1.7320507764816284, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1092 }, { "completion_length": 542.25, "epoch": 0.3029379157427938, "grad_norm": 0.3595799207687378, "kl": 0.04573914408683777, "learning_rate": 4.962448959995341e-06, "loss": 0.0, "reward": 4.5625, "reward_std": 2.375, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.6875, "step": 1093 }, { "completion_length": 517.5, "epoch": 0.3032150776053215, "grad_norm": 0.3271923065185547, "kl": 0.03597258776426315, "learning_rate": 4.962373328392549e-06, "loss": -0.0, "reward": 2.0, "reward_std": 0.28867512941360474, "rewards/confident_score_func": 0.25, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1094 }, { "completion_length": 508.5, "epoch": 0.3034922394678492, "grad_norm": 0.3763786256313324, "kl": 0.03348341956734657, "learning_rate": 4.96229762127901e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1095 }, { "completion_length": 533.5, "epoch": 0.30376940133037694, "grad_norm": 0.33512064814567566, "kl": 0.02956058830022812, "learning_rate": 4.9622218386570455e-06, "loss": 0.0, "reward": 5.5, "reward_std": 0.5, "rewards/confident_score_func": 1.75, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1096 }, { "completion_length": 501.75, "epoch": 0.30404656319290463, "grad_norm": 0.30794987082481384, "kl": 0.04145538806915283, "learning_rate": 4.9621459805289795e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1097 }, { "completion_length": 479.75, "epoch": 0.30432372505543237, "grad_norm": 0.38096490502357483, "kl": 0.033310189843177795, "learning_rate": 4.9620700468971395e-06, "loss": -0.0, "reward": 5.5, "reward_std": 0.5, "rewards/confident_score_func": 1.75, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1098 }, { "completion_length": 485.0, "epoch": 0.3046008869179601, "grad_norm": 0.3316032886505127, "kl": 0.03748127818107605, "learning_rate": 4.961994037763853e-06, "loss": -0.0, "reward": 2.0, "reward_std": 0.5, "rewards/confident_score_func": 0.25, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1099 }, { "completion_length": 534.5, "epoch": 0.3048780487804878, "grad_norm": 0.2861294150352478, "kl": 0.028865870088338852, "learning_rate": 4.9619179531314494e-06, "loss": 0.0, "reward": 2.875, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1100 }, { "completion_length": 632.25, "epoch": 0.30515521064301554, "grad_norm": 0.3039053976535797, "kl": 0.05115751922130585, "learning_rate": 4.961841793002266e-06, "loss": -0.0, "reward": 3.125, "reward_std": 1.7969882488250732, "rewards/confident_score_func": 0.875, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1101 }, { "completion_length": 515.5, "epoch": 0.3054323725055432, "grad_norm": 0.3199681341648102, "kl": 0.04196203500032425, "learning_rate": 4.961765557378634e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1102 }, { "completion_length": 544.25, "epoch": 0.30570953436807097, "grad_norm": 0.3115306496620178, "kl": 0.03211309388279915, "learning_rate": 4.961689246262894e-06, "loss": 0.0, "reward": 4.375, "reward_std": 2.136000871658325, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1103 }, { "completion_length": 539.25, "epoch": 0.30598669623059865, "grad_norm": 0.3951904773712158, "kl": 0.035172417759895325, "learning_rate": 4.961612859657384e-06, "loss": 0.0, "reward": 4.875, "reward_std": 1.75, "rewards/confident_score_func": 1.625, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1104 }, { "completion_length": 464.5, "epoch": 0.3062638580931264, "grad_norm": 0.3288668692111969, "kl": 0.03647911548614502, "learning_rate": 4.96153639756445e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1105 }, { "completion_length": 552.75, "epoch": 0.3065410199556541, "grad_norm": 0.31002286076545715, "kl": 0.03557100519537926, "learning_rate": 4.961459859986432e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1106 }, { "completion_length": 446.25, "epoch": 0.3068181818181818, "grad_norm": 0.0, "kl": 0.038396842777729034, "learning_rate": 4.96138324692568e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1107 }, { "completion_length": 440.0, "epoch": 0.3070953436807095, "grad_norm": 0.343617707490921, "kl": 0.04413169249892235, "learning_rate": 4.961306558384543e-06, "loss": 0.0, "reward": 1.875, "reward_std": 0.25, "rewards/confident_score_func": 0.125, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1108 }, { "completion_length": 456.5, "epoch": 0.30737250554323725, "grad_norm": 0.33297204971313477, "kl": 0.03945549577474594, "learning_rate": 4.961229794365373e-06, "loss": -0.0, "reward": 1.875, "reward_std": 0.25, "rewards/confident_score_func": 0.125, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1109 }, { "completion_length": 507.5, "epoch": 0.307649667405765, "grad_norm": 0.38084739446640015, "kl": 0.03214975818991661, "learning_rate": 4.961152954870524e-06, "loss": 0.0, "reward": 5.0, "reward_std": 0.5, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1110 }, { "completion_length": 512.75, "epoch": 0.3079268292682927, "grad_norm": 0.33698058128356934, "kl": 0.034554168581962585, "learning_rate": 4.961076039902352e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1111 }, { "completion_length": 433.0, "epoch": 0.3082039911308204, "grad_norm": 0.34114569425582886, "kl": 0.03633279725909233, "learning_rate": 4.960999049463215e-06, "loss": 0.0, "reward": 4.5, "reward_std": 1.8929693698883057, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1112 }, { "completion_length": 471.75, "epoch": 0.3084811529933481, "grad_norm": 0.0, "kl": 0.033922865986824036, "learning_rate": 4.960921983555475e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1113 }, { "completion_length": 498.0, "epoch": 0.30875831485587585, "grad_norm": 0.4313189685344696, "kl": 0.03922605141997337, "learning_rate": 4.9608448421814944e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1114 }, { "completion_length": 502.25, "epoch": 0.30903547671840353, "grad_norm": 0.4542483985424042, "kl": 0.038901373744010925, "learning_rate": 4.96076762534364e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1115 }, { "completion_length": 510.5, "epoch": 0.3093126385809313, "grad_norm": 0.3452955186367035, "kl": 0.040303219109773636, "learning_rate": 4.960690333044279e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1116 }, { "completion_length": 483.25, "epoch": 0.30958980044345896, "grad_norm": 0.36007940769195557, "kl": 0.03261406347155571, "learning_rate": 4.960612965285781e-06, "loss": 0.0, "reward": 3.71875, "reward_std": 2.0113608837127686, "rewards/confident_score_func": 1.125, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 1117 }, { "completion_length": 418.0, "epoch": 0.3098669623059867, "grad_norm": 0.4036072790622711, "kl": 0.0407077856361866, "learning_rate": 4.960535522070521e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1118 }, { "completion_length": 519.75, "epoch": 0.3101441241685144, "grad_norm": 0.2919016480445862, "kl": 0.03211240842938423, "learning_rate": 4.960458003400871e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1119 }, { "completion_length": 510.25, "epoch": 0.31042128603104213, "grad_norm": 0.3502099812030792, "kl": 0.038386594504117966, "learning_rate": 4.960380409279209e-06, "loss": 0.0, "reward": 4.5, "reward_std": 1.8929693698883057, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1120 }, { "completion_length": 408.0, "epoch": 0.3106984478935698, "grad_norm": 0.407134085893631, "kl": 0.057649362832307816, "learning_rate": 4.960302739707915e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1121 }, { "completion_length": 487.0, "epoch": 0.31097560975609756, "grad_norm": 0.0, "kl": 0.046428535133600235, "learning_rate": 4.960224994689371e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1122 }, { "completion_length": 463.75, "epoch": 0.3112527716186253, "grad_norm": 0.37614232301712036, "kl": 0.03551040217280388, "learning_rate": 4.96014717422596e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1123 }, { "completion_length": 421.5, "epoch": 0.311529933481153, "grad_norm": 0.39075303077697754, "kl": 0.03994963690638542, "learning_rate": 4.960069278320069e-06, "loss": -0.0, "reward": 4.0, "reward_std": 2.020725965499878, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1124 }, { "completion_length": 482.75, "epoch": 0.31180709534368073, "grad_norm": 0.4470120966434479, "kl": 0.04248597472906113, "learning_rate": 4.959991306974088e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1125 }, { "completion_length": 467.0, "epoch": 0.3120842572062084, "grad_norm": 0.314059853553772, "kl": 0.04295017570257187, "learning_rate": 4.959913260190406e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1126 }, { "completion_length": 480.75, "epoch": 0.31236141906873616, "grad_norm": 0.3494327962398529, "kl": 0.03913170471787453, "learning_rate": 4.959835137971417e-06, "loss": 0.0, "reward": 3.625, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.875, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1127 }, { "completion_length": 583.0, "epoch": 0.31263858093126384, "grad_norm": 0.29753169417381287, "kl": 0.0366206020116806, "learning_rate": 4.959756940319518e-06, "loss": 0.0, "reward": 4.5, "reward_std": 1.8929693698883057, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1128 }, { "completion_length": 456.5, "epoch": 0.3129157427937916, "grad_norm": 0.337272584438324, "kl": 0.03725085034966469, "learning_rate": 4.959678667237106e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1129 }, { "completion_length": 422.5, "epoch": 0.31319290465631927, "grad_norm": 0.0, "kl": 0.03769221901893616, "learning_rate": 4.959600318726581e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1130 }, { "completion_length": 517.0, "epoch": 0.313470066518847, "grad_norm": 0.3837946057319641, "kl": 0.04610493779182434, "learning_rate": 4.959521894790344e-06, "loss": -0.0, "reward": 3.875, "reward_std": 2.1746647357940674, "rewards/confident_score_func": 1.125, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1131 }, { "completion_length": 474.75, "epoch": 0.3137472283813747, "grad_norm": 0.334856778383255, "kl": 0.03916240483522415, "learning_rate": 4.959443395430804e-06, "loss": -0.0, "reward": 3.5, "reward_std": 2.0615527629852295, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1132 }, { "completion_length": 530.0, "epoch": 0.31402439024390244, "grad_norm": 0.35119956731796265, "kl": 0.04222939535975456, "learning_rate": 4.9593648206503665e-06, "loss": 0.0, "reward": 2.5, "reward_std": 1.5, "rewards/confident_score_func": 0.25, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1133 }, { "completion_length": 393.0, "epoch": 0.3143015521064302, "grad_norm": 0.36375173926353455, "kl": 0.054555218666791916, "learning_rate": 4.9592861704514395e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1134 }, { "completion_length": 481.0, "epoch": 0.31457871396895787, "grad_norm": 0.4394485056400299, "kl": 0.041023362427949905, "learning_rate": 4.959207444836436e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1135 }, { "completion_length": 470.0, "epoch": 0.3148558758314856, "grad_norm": 0.3402627408504486, "kl": 0.04070618003606796, "learning_rate": 4.959128643807771e-06, "loss": -0.0, "reward": 2.5, "reward_std": 1.5, "rewards/confident_score_func": 0.25, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1136 }, { "completion_length": 484.5, "epoch": 0.3151330376940133, "grad_norm": 0.33143943548202515, "kl": 0.04048031568527222, "learning_rate": 4.95904976736786e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1137 }, { "completion_length": 455.25, "epoch": 0.31541019955654104, "grad_norm": 0.3427177369594574, "kl": 0.041785869747400284, "learning_rate": 4.958970815519121e-06, "loss": 0.0, "reward": 2.5, "reward_std": 1.5, "rewards/confident_score_func": 0.25, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1138 }, { "completion_length": 541.25, "epoch": 0.3156873614190687, "grad_norm": 0.3289494514465332, "kl": 0.0454472191631794, "learning_rate": 4.958891788263977e-06, "loss": 0.0, "reward": 3.875, "reward_std": 2.1746647357940674, "rewards/confident_score_func": 1.125, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1139 }, { "completion_length": 485.25, "epoch": 0.31596452328159647, "grad_norm": 0.3363562226295471, "kl": 0.057065967470407486, "learning_rate": 4.95881268560485e-06, "loss": -0.0, "reward": 3.875, "reward_std": 2.1746647357940674, "rewards/confident_score_func": 1.125, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1140 }, { "completion_length": 555.0, "epoch": 0.31624168514412415, "grad_norm": 0.3529166281223297, "kl": 0.03581266477704048, "learning_rate": 4.958733507544167e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1141 }, { "completion_length": 520.25, "epoch": 0.3165188470066519, "grad_norm": 0.28561487793922424, "kl": 0.030292684212327003, "learning_rate": 4.958654254084356e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1142 }, { "completion_length": 427.75, "epoch": 0.3167960088691796, "grad_norm": 0.3731602132320404, "kl": 0.045254990458488464, "learning_rate": 4.958574925227846e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1143 }, { "completion_length": 415.25, "epoch": 0.3170731707317073, "grad_norm": 0.4558444619178772, "kl": 0.04705418646335602, "learning_rate": 4.95849552097707e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1144 }, { "completion_length": 425.0, "epoch": 0.317350332594235, "grad_norm": 0.0, "kl": 0.04962959140539169, "learning_rate": 4.9584160413344636e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1145 }, { "completion_length": 475.25, "epoch": 0.31762749445676275, "grad_norm": 0.3195273280143738, "kl": 0.0365711934864521, "learning_rate": 4.9583364863024645e-06, "loss": -0.0, "reward": 3.5, "reward_std": 2.0615527629852295, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1146 }, { "completion_length": 529.25, "epoch": 0.3179046563192905, "grad_norm": 0.3030664622783661, "kl": 0.04061971232295036, "learning_rate": 4.9582568558835124e-06, "loss": 0.0, "reward": 5.5, "reward_std": 0.5, "rewards/confident_score_func": 1.75, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1147 }, { "completion_length": 434.5, "epoch": 0.3181818181818182, "grad_norm": 0.41370856761932373, "kl": 0.03620387241244316, "learning_rate": 4.958177150080048e-06, "loss": 0.0, "reward": 4.875, "reward_std": 1.75, "rewards/confident_score_func": 1.625, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1148 }, { "completion_length": 477.5, "epoch": 0.3184589800443459, "grad_norm": 0.3442181944847107, "kl": 0.04408401623368263, "learning_rate": 4.958097368894516e-06, "loss": 0.0, "reward": 2.875, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1149 }, { "completion_length": 440.75, "epoch": 0.3187361419068736, "grad_norm": 0.3763028085231781, "kl": 0.039957500994205475, "learning_rate": 4.958017512329363e-06, "loss": 0.0, "reward": 4.875, "reward_std": 1.75, "rewards/confident_score_func": 1.625, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1150 }, { "completion_length": 484.75, "epoch": 0.31901330376940135, "grad_norm": 0.38004690408706665, "kl": 0.03770674392580986, "learning_rate": 4.957937580387038e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1151 }, { "completion_length": 504.75, "epoch": 0.31929046563192903, "grad_norm": 0.33805757761001587, "kl": 0.04405752196907997, "learning_rate": 4.957857573069992e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1152 }, { "completion_length": 482.75, "epoch": 0.3195676274944568, "grad_norm": 0.37845373153686523, "kl": 0.03547224402427673, "learning_rate": 4.957777490380678e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1153 }, { "completion_length": 452.0, "epoch": 0.31984478935698446, "grad_norm": 0.34857606887817383, "kl": 0.05628936365246773, "learning_rate": 4.957697332321554e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1154 }, { "completion_length": 451.0, "epoch": 0.3201219512195122, "grad_norm": 0.34407860040664673, "kl": 0.042310815304517746, "learning_rate": 4.957617098895076e-06, "loss": 0.0, "reward": 4.5, "reward_std": 1.8929693698883057, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1155 }, { "completion_length": 500.0, "epoch": 0.3203991130820399, "grad_norm": 0.3506558835506439, "kl": 0.035359375178813934, "learning_rate": 4.957536790103705e-06, "loss": -0.0, "reward": 2.875, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1156 }, { "completion_length": 429.5, "epoch": 0.32067627494456763, "grad_norm": 0.3733195960521698, "kl": 0.05263623222708702, "learning_rate": 4.957456405949904e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1157 }, { "completion_length": 493.25, "epoch": 0.32095343680709537, "grad_norm": 0.33772388100624084, "kl": 0.04574166610836983, "learning_rate": 4.957375946436137e-06, "loss": 0.0, "reward": 3.5, "reward_std": 2.0615527629852295, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1158 }, { "completion_length": 455.5, "epoch": 0.32123059866962306, "grad_norm": 0.0, "kl": 0.041672173887491226, "learning_rate": 4.957295411564873e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1159 }, { "completion_length": 498.5, "epoch": 0.3215077605321508, "grad_norm": 0.0, "kl": 0.047211360186338425, "learning_rate": 4.9572148013385815e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1160 }, { "completion_length": 499.75, "epoch": 0.3217849223946785, "grad_norm": 0.32244303822517395, "kl": 0.0575113371014595, "learning_rate": 4.9571341157597315e-06, "loss": -0.0, "reward": 2.5, "reward_std": 1.5, "rewards/confident_score_func": 0.25, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1161 }, { "completion_length": 456.75, "epoch": 0.3220620842572062, "grad_norm": 0.32381513714790344, "kl": 0.04496585950255394, "learning_rate": 4.957053354830802e-06, "loss": -0.0, "reward": 5.5, "reward_std": 0.5, "rewards/confident_score_func": 1.75, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1162 }, { "completion_length": 442.5, "epoch": 0.3223392461197339, "grad_norm": 0.45719805359840393, "kl": 0.06052461639046669, "learning_rate": 4.956972518554266e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1163 }, { "completion_length": 427.25, "epoch": 0.32261640798226165, "grad_norm": 0.33322468400001526, "kl": 0.04490192234516144, "learning_rate": 4.956891606932604e-06, "loss": 0.0, "reward": 5.5, "reward_std": 0.5, "rewards/confident_score_func": 1.75, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1164 }, { "completion_length": 459.25, "epoch": 0.32289356984478934, "grad_norm": 0.3246477246284485, "kl": 0.037682272493839264, "learning_rate": 4.956810619968296e-06, "loss": 0.0, "reward": 4.5, "reward_std": 1.8929693698883057, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1165 }, { "completion_length": 489.75, "epoch": 0.3231707317073171, "grad_norm": 0.36216503381729126, "kl": 0.04434441402554512, "learning_rate": 4.956729557663827e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1166 }, { "completion_length": 496.75, "epoch": 0.32344789356984477, "grad_norm": 0.4187166392803192, "kl": 0.0460393950343132, "learning_rate": 4.956648420021682e-06, "loss": -0.0, "reward": 3.875, "reward_std": 2.1746647357940674, "rewards/confident_score_func": 1.125, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1167 }, { "completion_length": 548.25, "epoch": 0.3237250554323725, "grad_norm": 0.35309234261512756, "kl": 0.059680499136447906, "learning_rate": 4.956567207044349e-06, "loss": -0.0, "reward": 2.875, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1168 }, { "completion_length": 495.75, "epoch": 0.3240022172949002, "grad_norm": 0.3352111577987671, "kl": 0.05061076581478119, "learning_rate": 4.95648591873432e-06, "loss": 0.0, "reward": 4.5, "reward_std": 1.8929693698883057, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1169 }, { "completion_length": 499.0, "epoch": 0.32427937915742794, "grad_norm": 0.310629278421402, "kl": 0.041773438453674316, "learning_rate": 4.956404555094086e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1170 }, { "completion_length": 493.0, "epoch": 0.3245565410199557, "grad_norm": 0.33218327164649963, "kl": 0.04769166558980942, "learning_rate": 4.9563231161261426e-06, "loss": -0.0, "reward": 2.0, "reward_std": 0.5, "rewards/confident_score_func": -0.25, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1171 }, { "completion_length": 495.5, "epoch": 0.32483370288248337, "grad_norm": 0.32254505157470703, "kl": 0.06047322601079941, "learning_rate": 4.956241601832987e-06, "loss": 0.0, "reward": 2.625, "reward_std": 1.4361406564712524, "rewards/confident_score_func": 0.375, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1172 }, { "completion_length": 416.5, "epoch": 0.3251108647450111, "grad_norm": 0.35028770565986633, "kl": 0.04183528572320938, "learning_rate": 4.956160012217119e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1173 }, { "completion_length": 434.0, "epoch": 0.3253880266075388, "grad_norm": 0.49332574009895325, "kl": 0.04423806443810463, "learning_rate": 4.956078347281042e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1174 }, { "completion_length": 399.25, "epoch": 0.32566518847006654, "grad_norm": 0.0, "kl": 0.05001906305551529, "learning_rate": 4.955996607027258e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1175 }, { "completion_length": 535.75, "epoch": 0.3259423503325942, "grad_norm": 0.31278154253959656, "kl": 0.05572007969021797, "learning_rate": 4.9559147914582756e-06, "loss": -0.0, "reward": 2.625, "reward_std": 1.4361406564712524, "rewards/confident_score_func": 0.375, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1176 }, { "completion_length": 473.5, "epoch": 0.32621951219512196, "grad_norm": 0.368752658367157, "kl": 0.048525165766477585, "learning_rate": 4.955832900576603e-06, "loss": -0.0, "reward": 3.875, "reward_std": 2.1746647357940674, "rewards/confident_score_func": 1.125, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1177 }, { "completion_length": 476.25, "epoch": 0.32649667405764965, "grad_norm": 0.3616432249546051, "kl": 0.05032933130860329, "learning_rate": 4.95575093438475e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1178 }, { "completion_length": 408.5, "epoch": 0.3267738359201774, "grad_norm": 0.5071893930435181, "kl": 0.07715056091547012, "learning_rate": 4.9556688928852325e-06, "loss": -0.0, "reward": 2.5625, "reward_std": 1.625, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.6875, "step": 1179 }, { "completion_length": 443.5, "epoch": 0.3270509977827051, "grad_norm": 0.3560357391834259, "kl": 0.04787587746977806, "learning_rate": 4.9555867760805655e-06, "loss": 0.0, "reward": 4.875, "reward_std": 1.75, "rewards/confident_score_func": 1.625, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1180 }, { "completion_length": 546.0, "epoch": 0.3273281596452328, "grad_norm": 0.31635820865631104, "kl": 0.049242328852415085, "learning_rate": 4.955504583973266e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1181 }, { "completion_length": 526.25, "epoch": 0.3276053215077605, "grad_norm": 0.0, "kl": 0.03929964080452919, "learning_rate": 4.955422316565856e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1182 }, { "completion_length": 400.0, "epoch": 0.32788248337028825, "grad_norm": 0.42131316661834717, "kl": 0.051260679960250854, "learning_rate": 4.955339973860859e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1183 }, { "completion_length": 478.25, "epoch": 0.328159645232816, "grad_norm": 0.3642728924751282, "kl": 0.052802037447690964, "learning_rate": 4.955257555860798e-06, "loss": 0.0, "reward": 4.5, "reward_std": 1.8929693698883057, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1184 }, { "completion_length": 482.0, "epoch": 0.3284368070953437, "grad_norm": 0.0, "kl": 0.03855329006910324, "learning_rate": 4.9551750625682015e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1185 }, { "completion_length": 427.0, "epoch": 0.3287139689578714, "grad_norm": 0.4507885277271271, "kl": 0.04415288195014, "learning_rate": 4.955092493985599e-06, "loss": -0.0, "reward": 1.875, "reward_std": 0.25, "rewards/confident_score_func": 0.125, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1186 }, { "completion_length": 444.0, "epoch": 0.3289911308203991, "grad_norm": 0.3614337146282196, "kl": 0.046982016414403915, "learning_rate": 4.955009850115523e-06, "loss": -0.0, "reward": 4.625, "reward_std": 1.6520190238952637, "rewards/confident_score_func": 1.375, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1187 }, { "completion_length": 561.0, "epoch": 0.32926829268292684, "grad_norm": 0.3489435315132141, "kl": 0.03863685950636864, "learning_rate": 4.954927130960507e-06, "loss": 0.0, "reward": 2.875, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1188 }, { "completion_length": 488.0, "epoch": 0.32954545454545453, "grad_norm": 0.30323469638824463, "kl": 0.043695732951164246, "learning_rate": 4.954844336523089e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1189 }, { "completion_length": 422.5, "epoch": 0.32982261640798227, "grad_norm": 0.0, "kl": 0.048892512917518616, "learning_rate": 4.954761466805806e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1190 }, { "completion_length": 473.0, "epoch": 0.33009977827050996, "grad_norm": 0.0, "kl": 0.03439199924468994, "learning_rate": 4.954678521811201e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1191 }, { "completion_length": 510.5, "epoch": 0.3303769401330377, "grad_norm": 0.31067654490470886, "kl": 0.05122602730989456, "learning_rate": 4.954595501541817e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1192 }, { "completion_length": 443.25, "epoch": 0.3306541019955654, "grad_norm": 0.4069591462612152, "kl": 0.04841720312833786, "learning_rate": 4.954512406000199e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1193 }, { "completion_length": 423.75, "epoch": 0.3309312638580931, "grad_norm": 0.35489630699157715, "kl": 0.048874661326408386, "learning_rate": 4.954429235188897e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1194 }, { "completion_length": 473.5, "epoch": 0.33120842572062087, "grad_norm": 0.3569456934928894, "kl": 0.03203795477747917, "learning_rate": 4.95434598911046e-06, "loss": -0.0, "reward": 2.5, "reward_std": 1.5, "rewards/confident_score_func": 0.25, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1195 }, { "completion_length": 407.0, "epoch": 0.33148558758314856, "grad_norm": 0.41339239478111267, "kl": 0.046092480421066284, "learning_rate": 4.954262667767442e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1196 }, { "completion_length": 472.25, "epoch": 0.3317627494456763, "grad_norm": 0.0, "kl": 0.049189768731594086, "learning_rate": 4.954179271162396e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1197 }, { "completion_length": 469.25, "epoch": 0.332039911308204, "grad_norm": 0.3270930349826813, "kl": 0.04310819134116173, "learning_rate": 4.954095799297882e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1198 }, { "completion_length": 433.0, "epoch": 0.3323170731707317, "grad_norm": 0.3484914302825928, "kl": 0.04856841266155243, "learning_rate": 4.9540122521764585e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1199 }, { "completion_length": 437.0, "epoch": 0.3325942350332594, "grad_norm": 0.0, "kl": 0.04057430848479271, "learning_rate": 4.953928629800688e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1200 }, { "completion_length": 435.0, "epoch": 0.33287139689578715, "grad_norm": 0.40965548157691956, "kl": 0.04365210235118866, "learning_rate": 4.953844932173134e-06, "loss": -0.0, "reward": 2.875, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1201 }, { "completion_length": 417.25, "epoch": 0.33314855875831484, "grad_norm": 0.0, "kl": 0.04522743821144104, "learning_rate": 4.953761159296364e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1202 }, { "completion_length": 473.25, "epoch": 0.3334257206208426, "grad_norm": 0.6562620997428894, "kl": 0.04574869945645332, "learning_rate": 4.953677311172946e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1203 }, { "completion_length": 440.0, "epoch": 0.33370288248337027, "grad_norm": 0.32108864188194275, "kl": 0.043894585222005844, "learning_rate": 4.953593387805453e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1204 }, { "completion_length": 458.75, "epoch": 0.333980044345898, "grad_norm": 0.3207654058933258, "kl": 0.0430169478058815, "learning_rate": 4.953509389196457e-06, "loss": -0.0, "reward": 3.875, "reward_std": 2.1746647357940674, "rewards/confident_score_func": 1.125, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1205 }, { "completion_length": 510.0, "epoch": 0.3342572062084257, "grad_norm": 0.3113867938518524, "kl": 0.06477995961904526, "learning_rate": 4.953425315348534e-06, "loss": 0.0, "reward": 2.5, "reward_std": 1.5, "rewards/confident_score_func": 0.25, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1206 }, { "completion_length": 507.75, "epoch": 0.33453436807095344, "grad_norm": 0.343845933675766, "kl": 0.04756207764148712, "learning_rate": 4.9533411662642625e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1207 }, { "completion_length": 517.75, "epoch": 0.3348115299334812, "grad_norm": 0.33315718173980713, "kl": 0.04020114988088608, "learning_rate": 4.953256941946224e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1208 }, { "completion_length": 523.75, "epoch": 0.33508869179600886, "grad_norm": 0.3359604477882385, "kl": 0.05294032394886017, "learning_rate": 4.953172642396999e-06, "loss": 0.0, "reward": 4.5, "reward_std": 1.8929693698883057, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1209 }, { "completion_length": 499.0, "epoch": 0.3353658536585366, "grad_norm": 0.399522602558136, "kl": 0.048592012375593185, "learning_rate": 4.953088267619175e-06, "loss": 0.0, "reward": 3.0, "reward_std": 1.8484227657318115, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1210 }, { "completion_length": 460.25, "epoch": 0.3356430155210643, "grad_norm": 0.37183481454849243, "kl": 0.08395577222108841, "learning_rate": 4.9530038176153385e-06, "loss": 0.0, "reward": 4.25, "reward_std": 1.7320507764816284, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1211 }, { "completion_length": 385.75, "epoch": 0.33592017738359203, "grad_norm": 0.0, "kl": 0.04572058841586113, "learning_rate": 4.952919292388079e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1212 }, { "completion_length": 419.5, "epoch": 0.3361973392461197, "grad_norm": 0.3473335802555084, "kl": 0.04448418319225311, "learning_rate": 4.952834691939988e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1213 }, { "completion_length": 421.25, "epoch": 0.33647450110864746, "grad_norm": 0.304731160402298, "kl": 0.038719020783901215, "learning_rate": 4.952750016273662e-06, "loss": -0.0, "reward": 4.25, "reward_std": 1.7320507764816284, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1214 }, { "completion_length": 403.5, "epoch": 0.33675166297117515, "grad_norm": 0.4113273322582245, "kl": 0.06110246106982231, "learning_rate": 4.952665265391695e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1215 }, { "completion_length": 477.75, "epoch": 0.3370288248337029, "grad_norm": 0.3652150332927704, "kl": 0.03647322207689285, "learning_rate": 4.952580439296687e-06, "loss": -0.0, "reward": 3.875, "reward_std": 2.1746647357940674, "rewards/confident_score_func": 1.125, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1216 }, { "completion_length": 419.0, "epoch": 0.3373059866962306, "grad_norm": 0.3646344542503357, "kl": 0.03847577050328255, "learning_rate": 4.95249553799124e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1217 }, { "completion_length": 421.5, "epoch": 0.3375831485587583, "grad_norm": 0.30988407135009766, "kl": 0.04268668591976166, "learning_rate": 4.952410561477957e-06, "loss": 0.0, "reward": 1.875, "reward_std": 0.25, "rewards/confident_score_func": 0.125, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1218 }, { "completion_length": 462.5, "epoch": 0.33786031042128606, "grad_norm": 0.4540899097919464, "kl": 0.03870726376771927, "learning_rate": 4.952325509759444e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1219 }, { "completion_length": 402.75, "epoch": 0.33813747228381374, "grad_norm": 0.3733746409416199, "kl": 0.04570271819829941, "learning_rate": 4.9522403828383085e-06, "loss": 0.0, "reward": 4.5, "reward_std": 1.8929693698883057, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1220 }, { "completion_length": 435.75, "epoch": 0.3384146341463415, "grad_norm": 0.4037312865257263, "kl": 0.04795628413558006, "learning_rate": 4.952155180717161e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1221 }, { "completion_length": 384.5, "epoch": 0.3386917960088692, "grad_norm": 0.4268009066581726, "kl": 0.07063834369182587, "learning_rate": 4.9520699033986164e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1222 }, { "completion_length": 530.5, "epoch": 0.3389689578713969, "grad_norm": 0.3304324746131897, "kl": 0.05012734234333038, "learning_rate": 4.951984550885287e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1223 }, { "completion_length": 407.75, "epoch": 0.3392461197339246, "grad_norm": 0.5126408934593201, "kl": 0.08795095235109329, "learning_rate": 4.951899123179792e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1224 }, { "completion_length": 418.75, "epoch": 0.33952328159645234, "grad_norm": 0.3530424237251282, "kl": 0.048837993294000626, "learning_rate": 4.95181362028475e-06, "loss": 0.0, "reward": 3.625, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.875, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1225 }, { "completion_length": 537.5, "epoch": 0.33980044345898003, "grad_norm": 0.3318172097206116, "kl": 0.0344761498272419, "learning_rate": 4.951728042202784e-06, "loss": 0.0, "reward": 4.5, "reward_std": 1.8929693698883057, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1226 }, { "completion_length": 480.25, "epoch": 0.34007760532150777, "grad_norm": 0.3655102849006653, "kl": 0.044387735426425934, "learning_rate": 4.951642388936518e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1227 }, { "completion_length": 408.75, "epoch": 0.34035476718403546, "grad_norm": 0.4363381862640381, "kl": 0.07234556972980499, "learning_rate": 4.951556660488578e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1228 }, { "completion_length": 428.25, "epoch": 0.3406319290465632, "grad_norm": 0.0, "kl": 0.04318170249462128, "learning_rate": 4.9514708568615935e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1229 }, { "completion_length": 435.5, "epoch": 0.3409090909090909, "grad_norm": 0.4276098906993866, "kl": 0.04116055741906166, "learning_rate": 4.951384978058196e-06, "loss": -0.0, "reward": 3.875, "reward_std": 2.1746647357940674, "rewards/confident_score_func": 1.125, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1230 }, { "completion_length": 511.5, "epoch": 0.3411862527716186, "grad_norm": 0.36302265524864197, "kl": 0.048705413937568665, "learning_rate": 4.951299024081019e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1231 }, { "completion_length": 357.75, "epoch": 0.34146341463414637, "grad_norm": 0.0, "kl": 0.05431542545557022, "learning_rate": 4.951212994932697e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1232 }, { "completion_length": 428.5, "epoch": 0.34174057649667405, "grad_norm": 0.37243685126304626, "kl": 0.045504964888095856, "learning_rate": 4.951126890615871e-06, "loss": 0.0, "reward": 3.25, "reward_std": 2.3804759979248047, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1233 }, { "completion_length": 449.0, "epoch": 0.3420177383592018, "grad_norm": 0.3737602233886719, "kl": 0.0485365055501461, "learning_rate": 4.951040711133178e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1234 }, { "completion_length": 418.5, "epoch": 0.3422949002217295, "grad_norm": 0.34066733717918396, "kl": 0.057449821382761, "learning_rate": 4.950954456487264e-06, "loss": 0.0, "reward": 4.5, "reward_std": 1.8929693698883057, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1235 }, { "completion_length": 427.75, "epoch": 0.3425720620842572, "grad_norm": 0.3848603367805481, "kl": 0.0501105859875679, "learning_rate": 4.9508681266807715e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1236 }, { "completion_length": 468.5, "epoch": 0.3428492239467849, "grad_norm": 0.3811749517917633, "kl": 0.04116233438253403, "learning_rate": 4.950781721716349e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1237 }, { "completion_length": 395.25, "epoch": 0.34312638580931265, "grad_norm": 0.3977622985839844, "kl": 0.03642331063747406, "learning_rate": 4.950695241596646e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1238 }, { "completion_length": 401.5, "epoch": 0.34340354767184034, "grad_norm": 0.4199325442314148, "kl": 0.04215638339519501, "learning_rate": 4.9506086863243146e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1239 }, { "completion_length": 409.0, "epoch": 0.3436807095343681, "grad_norm": 0.37428539991378784, "kl": 0.06504176557064056, "learning_rate": 4.950522055902009e-06, "loss": -0.0, "reward": 1.875, "reward_std": 0.25, "rewards/confident_score_func": 0.125, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1240 }, { "completion_length": 452.25, "epoch": 0.34395787139689576, "grad_norm": 0.0, "kl": 0.04085565730929375, "learning_rate": 4.9504353503323865e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1241 }, { "completion_length": 472.75, "epoch": 0.3442350332594235, "grad_norm": 0.3516615331172943, "kl": 0.044558554887771606, "learning_rate": 4.950348569618105e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1242 }, { "completion_length": 415.5, "epoch": 0.3445121951219512, "grad_norm": 0.35607996582984924, "kl": 0.050577662885189056, "learning_rate": 4.950261713761826e-06, "loss": -0.0, "reward": 2.875, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1243 }, { "completion_length": 494.25, "epoch": 0.34478935698447893, "grad_norm": 0.36531907320022583, "kl": 0.03721573203802109, "learning_rate": 4.950174782766213e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1244 }, { "completion_length": 456.0, "epoch": 0.3450665188470067, "grad_norm": 0.37236568331718445, "kl": 0.041248224675655365, "learning_rate": 4.950087776633931e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1245 }, { "completion_length": 432.0, "epoch": 0.34534368070953436, "grad_norm": 0.4164941608905792, "kl": 0.053054843097925186, "learning_rate": 4.95000069536765e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1246 }, { "completion_length": 416.75, "epoch": 0.3456208425720621, "grad_norm": 0.3846975266933441, "kl": 0.05875835195183754, "learning_rate": 4.94991353897004e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1247 }, { "completion_length": 413.5, "epoch": 0.3458980044345898, "grad_norm": 0.3788717985153198, "kl": 0.05509291589260101, "learning_rate": 4.949826307443773e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1248 }, { "completion_length": 480.0, "epoch": 0.34617516629711753, "grad_norm": 0.31107738614082336, "kl": 0.05806124955415726, "learning_rate": 4.949739000791524e-06, "loss": 0.0, "reward": 4.875, "reward_std": 1.75, "rewards/confident_score_func": 1.625, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1249 }, { "completion_length": 534.25, "epoch": 0.3464523281596452, "grad_norm": 0.0, "kl": 0.07736926525831223, "learning_rate": 4.94965161901597e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1250 }, { "completion_length": 499.0, "epoch": 0.34672949002217296, "grad_norm": 0.36896926164627075, "kl": 0.039118874818086624, "learning_rate": 4.949564162119791e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1251 }, { "completion_length": 478.0, "epoch": 0.34700665188470065, "grad_norm": 0.4078272879123688, "kl": 0.04798232391476631, "learning_rate": 4.94947663010567e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1252 }, { "completion_length": 431.5, "epoch": 0.3472838137472284, "grad_norm": 0.4506378769874573, "kl": 0.09373219311237335, "learning_rate": 4.94938902297629e-06, "loss": -0.0, "reward": 2.875, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1253 }, { "completion_length": 473.0, "epoch": 0.3475609756097561, "grad_norm": 0.0, "kl": 0.05714603513479233, "learning_rate": 4.949301340734337e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1254 }, { "completion_length": 451.75, "epoch": 0.3478381374722838, "grad_norm": 0.0, "kl": 0.051898043602705, "learning_rate": 4.949213583382502e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1255 }, { "completion_length": 488.5, "epoch": 0.34811529933481156, "grad_norm": 0.4317739009857178, "kl": 0.08319830149412155, "learning_rate": 4.949125750923474e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1256 }, { "completion_length": 467.0, "epoch": 0.34839246119733924, "grad_norm": 1.3655380010604858, "kl": 0.05374177545309067, "learning_rate": 4.949037843359947e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1257 }, { "completion_length": 432.5, "epoch": 0.348669623059867, "grad_norm": 0.0, "kl": 0.05436611548066139, "learning_rate": 4.948949860694618e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1258 }, { "completion_length": 439.5, "epoch": 0.34894678492239467, "grad_norm": 0.0, "kl": 0.056315332651138306, "learning_rate": 4.948861802930184e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1259 }, { "completion_length": 400.5, "epoch": 0.3492239467849224, "grad_norm": 0.47050055861473083, "kl": 0.04908331111073494, "learning_rate": 4.948773670069345e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1260 }, { "completion_length": 502.0, "epoch": 0.3495011086474501, "grad_norm": 0.0, "kl": 0.04938698932528496, "learning_rate": 4.948685462114805e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1261 }, { "completion_length": 493.5, "epoch": 0.34977827050997784, "grad_norm": 0.43169739842414856, "kl": 0.04670584946870804, "learning_rate": 4.948597179069267e-06, "loss": -0.0, "reward": 2.65625, "reward_std": 2.0700619220733643, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.65625, "step": 1262 }, { "completion_length": 463.25, "epoch": 0.3500554323725055, "grad_norm": 0.37910449504852295, "kl": 0.07263357937335968, "learning_rate": 4.948508820935441e-06, "loss": -0.0, "reward": 2.875, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1263 }, { "completion_length": 431.5, "epoch": 0.35033259423503327, "grad_norm": 0.3850328028202057, "kl": 0.054650869220495224, "learning_rate": 4.948420387716034e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1264 }, { "completion_length": 456.25, "epoch": 0.35060975609756095, "grad_norm": 0.3417406976222992, "kl": 0.05569911375641823, "learning_rate": 4.9483318794137585e-06, "loss": -0.0, "reward": 3.5, "reward_std": 2.0615527629852295, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1265 }, { "completion_length": 404.75, "epoch": 0.3508869179600887, "grad_norm": 0.3868523836135864, "kl": 0.05233040824532509, "learning_rate": 4.94824329603133e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1266 }, { "completion_length": 495.0, "epoch": 0.3511640798226164, "grad_norm": 0.37257590889930725, "kl": 0.05010383203625679, "learning_rate": 4.948154637571463e-06, "loss": 0.0, "reward": 2.0, "reward_std": 0.28867512941360474, "rewards/confident_score_func": 0.25, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1267 }, { "completion_length": 521.25, "epoch": 0.3514412416851441, "grad_norm": 0.4233439564704895, "kl": 0.06150028482079506, "learning_rate": 4.948065904036878e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1268 }, { "completion_length": 463.25, "epoch": 0.35171840354767187, "grad_norm": 0.33747413754463196, "kl": 0.05899282172322273, "learning_rate": 4.947977095430296e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1269 }, { "completion_length": 560.0, "epoch": 0.35199556541019955, "grad_norm": 0.0, "kl": 0.04899216070771217, "learning_rate": 4.947888211754439e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1270 }, { "completion_length": 442.75, "epoch": 0.3522727272727273, "grad_norm": 0.5379586219787598, "kl": 0.059618450701236725, "learning_rate": 4.947799253012033e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1271 }, { "completion_length": 452.25, "epoch": 0.352549889135255, "grad_norm": 0.3670327663421631, "kl": 0.05775253847241402, "learning_rate": 4.947710219205808e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1272 }, { "completion_length": 427.0, "epoch": 0.3528270509977827, "grad_norm": 0.0, "kl": 0.04804341495037079, "learning_rate": 4.947621110338492e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1273 }, { "completion_length": 418.25, "epoch": 0.3531042128603104, "grad_norm": 0.3189499080181122, "kl": 0.12233645468950272, "learning_rate": 4.947531926412818e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1274 }, { "completion_length": 489.25, "epoch": 0.35338137472283815, "grad_norm": 0.36922571063041687, "kl": 0.05239514634013176, "learning_rate": 4.947442667431522e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1275 }, { "completion_length": 479.25, "epoch": 0.35365853658536583, "grad_norm": 0.33570247888565063, "kl": 0.04722484573721886, "learning_rate": 4.94735333339734e-06, "loss": -0.0, "reward": 3.5, "reward_std": 2.0615527629852295, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1276 }, { "completion_length": 475.25, "epoch": 0.3539356984478936, "grad_norm": 0.4119182229042053, "kl": 0.052285097539424896, "learning_rate": 4.947263924313012e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1277 }, { "completion_length": 498.0, "epoch": 0.35421286031042126, "grad_norm": 0.3219665586948395, "kl": 0.04703151062130928, "learning_rate": 4.947174440181281e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1278 }, { "completion_length": 430.0, "epoch": 0.354490022172949, "grad_norm": 0.4541938304901123, "kl": 0.05250055342912674, "learning_rate": 4.9470848810048875e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1279 }, { "completion_length": 512.25, "epoch": 0.35476718403547675, "grad_norm": 0.3707892596721649, "kl": 0.05769985914230347, "learning_rate": 4.946995246786582e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1280 }, { "completion_length": 438.0, "epoch": 0.35504434589800443, "grad_norm": 0.5145325064659119, "kl": 0.06438126415014267, "learning_rate": 4.946905537529111e-06, "loss": -0.0, "reward": 3.875, "reward_std": 2.1746647357940674, "rewards/confident_score_func": 1.125, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1281 }, { "completion_length": 436.0, "epoch": 0.3553215077605322, "grad_norm": 0.31930768489837646, "kl": 0.05212489515542984, "learning_rate": 4.946815753235227e-06, "loss": -0.0, "reward": 1.875, "reward_std": 0.25, "rewards/confident_score_func": 0.125, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1282 }, { "completion_length": 512.25, "epoch": 0.35559866962305986, "grad_norm": 0.29391607642173767, "kl": 0.047975823283195496, "learning_rate": 4.946725893907682e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1283 }, { "completion_length": 442.25, "epoch": 0.3558758314855876, "grad_norm": 0.4375475347042084, "kl": 0.045368365943431854, "learning_rate": 4.9466359595492305e-06, "loss": -0.0, "reward": 3.5, "reward_std": 2.0615527629852295, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1284 }, { "completion_length": 471.0, "epoch": 0.3561529933481153, "grad_norm": 0.0, "kl": 0.04465801268815994, "learning_rate": 4.946545950162634e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1285 }, { "completion_length": 523.75, "epoch": 0.35643015521064303, "grad_norm": 0.0, "kl": 0.0492875762283802, "learning_rate": 4.946455865750649e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1286 }, { "completion_length": 389.0, "epoch": 0.3567073170731707, "grad_norm": 0.42741137742996216, "kl": 0.06341007351875305, "learning_rate": 4.94636570631604e-06, "loss": -0.0, "reward": 2.5, "reward_std": 1.5, "rewards/confident_score_func": 0.25, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1287 }, { "completion_length": 483.75, "epoch": 0.35698447893569846, "grad_norm": 0.35286563634872437, "kl": 0.05766860023140907, "learning_rate": 4.9462754718615715e-06, "loss": 0.0, "reward": 2.5625, "reward_std": 2.1542110443115234, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.6875, "step": 1288 }, { "completion_length": 439.0, "epoch": 0.35726164079822614, "grad_norm": 0.36923959851264954, "kl": 0.052232805639505386, "learning_rate": 4.946185162390011e-06, "loss": -0.0, "reward": 3.875, "reward_std": 2.1746647357940674, "rewards/confident_score_func": 1.125, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1289 }, { "completion_length": 448.75, "epoch": 0.3575388026607539, "grad_norm": 0.36421459913253784, "kl": 0.05488550290465355, "learning_rate": 4.946094777904127e-06, "loss": -0.0, "reward": 2.875, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1290 }, { "completion_length": 441.5, "epoch": 0.35781596452328157, "grad_norm": 0.4626808166503906, "kl": 0.048189450055360794, "learning_rate": 4.946004318406692e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1291 }, { "completion_length": 501.5, "epoch": 0.3580931263858093, "grad_norm": 0.42165207862854004, "kl": 0.04486449062824249, "learning_rate": 4.945913783900479e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1292 }, { "completion_length": 473.75, "epoch": 0.35837028824833705, "grad_norm": 0.37699347734451294, "kl": 0.045615870505571365, "learning_rate": 4.945823174388265e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1293 }, { "completion_length": 491.75, "epoch": 0.35864745011086474, "grad_norm": 0.34931501746177673, "kl": 0.054333530366420746, "learning_rate": 4.94573248987283e-06, "loss": 0.0, "reward": 5.25, "reward_std": 0.5773502588272095, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1294 }, { "completion_length": 497.25, "epoch": 0.3589246119733925, "grad_norm": 0.3917674720287323, "kl": 0.038699980825185776, "learning_rate": 4.945641730356952e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1295 }, { "completion_length": 464.0, "epoch": 0.35920177383592017, "grad_norm": 0.36365967988967896, "kl": 0.0589275136590004, "learning_rate": 4.945550895843416e-06, "loss": 0.0, "reward": 3.875, "reward_std": 2.1746647357940674, "rewards/confident_score_func": 1.125, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1296 }, { "completion_length": 479.0, "epoch": 0.3594789356984479, "grad_norm": 0.5401272773742676, "kl": 0.06769148260354996, "learning_rate": 4.945459986335008e-06, "loss": 0.0, "reward": 3.875, "reward_std": 2.1746647357940674, "rewards/confident_score_func": 1.125, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1297 }, { "completion_length": 428.5, "epoch": 0.3597560975609756, "grad_norm": 0.0, "kl": 0.07261089235544205, "learning_rate": 4.9453690018345144e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1298 }, { "completion_length": 521.0, "epoch": 0.36003325942350334, "grad_norm": 0.3022534251213074, "kl": 0.06482867151498795, "learning_rate": 4.945277942344726e-06, "loss": -0.0, "reward": 4.875, "reward_std": 1.75, "rewards/confident_score_func": 1.625, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1299 }, { "completion_length": 543.25, "epoch": 0.360310421286031, "grad_norm": 0.32647934556007385, "kl": 0.04709937423467636, "learning_rate": 4.945186807868436e-06, "loss": 0.0, "reward": 4.5, "reward_std": 1.8929693698883057, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1300 }, { "completion_length": 437.25, "epoch": 0.36058758314855877, "grad_norm": 0.3858109414577484, "kl": 0.05258268117904663, "learning_rate": 4.945095598408437e-06, "loss": -0.0, "reward": 3.25, "reward_std": 1.7320507764816284, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1301 }, { "completion_length": 477.75, "epoch": 0.36086474501108645, "grad_norm": 0.3460140526294708, "kl": 0.07427533715963364, "learning_rate": 4.9450043139675284e-06, "loss": -0.0, "reward": 3.625, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.875, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1302 }, { "completion_length": 468.5, "epoch": 0.3611419068736142, "grad_norm": 0.3769211769104004, "kl": 0.06491927802562714, "learning_rate": 4.944912954548509e-06, "loss": -0.0, "reward": 5.25, "reward_std": 0.5773502588272095, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1303 }, { "completion_length": 497.25, "epoch": 0.3614190687361419, "grad_norm": 0.3783934414386749, "kl": 0.07449346035718918, "learning_rate": 4.944821520154178e-06, "loss": 0.0, "reward": 5.5, "reward_std": 0.5, "rewards/confident_score_func": 1.75, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1304 }, { "completion_length": 509.0, "epoch": 0.3616962305986696, "grad_norm": 0.34510326385498047, "kl": 0.05845344811677933, "learning_rate": 4.944730010787343e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1305 }, { "completion_length": 597.5, "epoch": 0.36197339246119736, "grad_norm": 0.36775854229927063, "kl": 0.04278390109539032, "learning_rate": 4.944638426450807e-06, "loss": 0.0, "reward": 4.5, "reward_std": 1.8929693698883057, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1306 }, { "completion_length": 443.75, "epoch": 0.36225055432372505, "grad_norm": 0.3921583592891693, "kl": 0.06333759427070618, "learning_rate": 4.944546767147381e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1307 }, { "completion_length": 459.0, "epoch": 0.3625277161862528, "grad_norm": 0.43135735392570496, "kl": 0.04580548033118248, "learning_rate": 4.944455032879874e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1308 }, { "completion_length": 521.5, "epoch": 0.3628048780487805, "grad_norm": 0.0, "kl": 0.06464941054582596, "learning_rate": 4.9443632236511e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1309 }, { "completion_length": 452.0, "epoch": 0.3630820399113082, "grad_norm": 0.4050907492637634, "kl": 0.07247640192508698, "learning_rate": 4.944271339463874e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1310 }, { "completion_length": 512.5, "epoch": 0.3633592017738359, "grad_norm": 0.0, "kl": 0.06783079355955124, "learning_rate": 4.944179380321015e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1311 }, { "completion_length": 547.0, "epoch": 0.36363636363636365, "grad_norm": 0.34115299582481384, "kl": 0.06506805866956711, "learning_rate": 4.944087346225341e-06, "loss": -0.0, "reward": 5.25, "reward_std": 0.5773502588272095, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1312 }, { "completion_length": 489.75, "epoch": 0.36391352549889133, "grad_norm": 0.3357463777065277, "kl": 0.04570919647812843, "learning_rate": 4.943995237179675e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1313 }, { "completion_length": 530.5, "epoch": 0.3641906873614191, "grad_norm": 0.38165098428726196, "kl": 0.058215025812387466, "learning_rate": 4.943903053186843e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1314 }, { "completion_length": 490.5, "epoch": 0.36446784922394676, "grad_norm": 0.37656813859939575, "kl": 0.0812653973698616, "learning_rate": 4.94381079424967e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1315 }, { "completion_length": 458.75, "epoch": 0.3647450110864745, "grad_norm": 0.0, "kl": 0.06585890054702759, "learning_rate": 4.9437184603709854e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1316 }, { "completion_length": 526.25, "epoch": 0.36502217294900224, "grad_norm": 0.4285425841808319, "kl": 0.06273116171360016, "learning_rate": 4.943626051553622e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1317 }, { "completion_length": 432.25, "epoch": 0.36529933481152993, "grad_norm": 0.0, "kl": 0.06133902445435524, "learning_rate": 4.943533567800413e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1318 }, { "completion_length": 566.5, "epoch": 0.36557649667405767, "grad_norm": 0.0, "kl": 0.045452017337083817, "learning_rate": 4.9434410091141936e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1319 }, { "completion_length": 455.0, "epoch": 0.36585365853658536, "grad_norm": 0.0, "kl": 0.08384886384010315, "learning_rate": 4.9433483754978025e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1320 }, { "completion_length": 405.25, "epoch": 0.3661308203991131, "grad_norm": 0.449907124042511, "kl": 0.060961659997701645, "learning_rate": 4.943255666954082e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1321 }, { "completion_length": 457.25, "epoch": 0.3664079822616408, "grad_norm": 0.0, "kl": 0.06532251834869385, "learning_rate": 4.943162883485874e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1322 }, { "completion_length": 557.75, "epoch": 0.3666851441241685, "grad_norm": 0.38457372784614563, "kl": 0.056872207671403885, "learning_rate": 4.943070025096022e-06, "loss": -0.0, "reward": 2.875, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1323 }, { "completion_length": 506.0, "epoch": 0.3669623059866962, "grad_norm": 0.3790721297264099, "kl": 0.10622705519199371, "learning_rate": 4.942977091787376e-06, "loss": -0.0, "reward": 3.875, "reward_std": 2.1746647357940674, "rewards/confident_score_func": 1.125, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1324 }, { "completion_length": 490.0, "epoch": 0.36723946784922396, "grad_norm": 0.37324535846710205, "kl": 0.05665488913655281, "learning_rate": 4.942884083562786e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1325 }, { "completion_length": 488.0, "epoch": 0.36751662971175164, "grad_norm": 0.35582780838012695, "kl": 0.049306631088256836, "learning_rate": 4.942791000425103e-06, "loss": -0.0, "reward": 2.875, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1326 }, { "completion_length": 479.25, "epoch": 0.3677937915742794, "grad_norm": 0.0, "kl": 0.057639554142951965, "learning_rate": 4.942697842377181e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1327 }, { "completion_length": 434.75, "epoch": 0.36807095343680707, "grad_norm": 0.0, "kl": 0.0663040429353714, "learning_rate": 4.942604609421878e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1328 }, { "completion_length": 470.25, "epoch": 0.3683481152993348, "grad_norm": 0.3663565516471863, "kl": 0.05199427902698517, "learning_rate": 4.942511301562053e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1329 }, { "completion_length": 569.5, "epoch": 0.36862527716186255, "grad_norm": 0.32181409001350403, "kl": 0.042057156562805176, "learning_rate": 4.9424179188005676e-06, "loss": -0.0, "reward": 3.5, "reward_std": 2.0615527629852295, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1330 }, { "completion_length": 568.25, "epoch": 0.36890243902439024, "grad_norm": 0.27534401416778564, "kl": 0.04810820892453194, "learning_rate": 4.942324461140283e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1331 }, { "completion_length": 500.25, "epoch": 0.369179600886918, "grad_norm": 0.4531612992286682, "kl": 0.05054739862680435, "learning_rate": 4.9422309285840684e-06, "loss": -0.0, "reward": 3.875, "reward_std": 2.1746647357940674, "rewards/confident_score_func": 1.125, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1332 }, { "completion_length": 596.75, "epoch": 0.36945676274944567, "grad_norm": 0.3311308026313782, "kl": 0.04571700468659401, "learning_rate": 4.94213732113479e-06, "loss": 0.0, "reward": 5.5, "reward_std": 0.5, "rewards/confident_score_func": 1.75, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1333 }, { "completion_length": 500.0, "epoch": 0.3697339246119734, "grad_norm": 0.3557955026626587, "kl": 0.052544400095939636, "learning_rate": 4.942043638795319e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1334 }, { "completion_length": 432.25, "epoch": 0.3700110864745011, "grad_norm": 0.0, "kl": 0.061962228268384933, "learning_rate": 4.941949881568529e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1335 }, { "completion_length": 420.75, "epoch": 0.37028824833702884, "grad_norm": 0.4408738613128662, "kl": 0.0647159144282341, "learning_rate": 4.941856049457293e-06, "loss": -0.0, "reward": 3.875, "reward_std": 2.1746647357940674, "rewards/confident_score_func": 1.125, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1336 }, { "completion_length": 451.75, "epoch": 0.3705654101995565, "grad_norm": 0.3485625088214874, "kl": 0.07582785934209824, "learning_rate": 4.941762142464491e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1337 }, { "completion_length": 480.5, "epoch": 0.37084257206208426, "grad_norm": 0.3784251809120178, "kl": 0.061238013207912445, "learning_rate": 4.941668160593e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1338 }, { "completion_length": 643.75, "epoch": 0.37111973392461195, "grad_norm": 0.38537877798080444, "kl": 0.049220647662878036, "learning_rate": 4.941574103845706e-06, "loss": 0.0, "reward": 3.875, "reward_std": 2.1746647357940674, "rewards/confident_score_func": 1.125, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1339 }, { "completion_length": 504.75, "epoch": 0.3713968957871397, "grad_norm": 0.3506568968296051, "kl": 0.04773741215467453, "learning_rate": 4.941479972225489e-06, "loss": -0.0, "reward": 3.875, "reward_std": 2.1746647357940674, "rewards/confident_score_func": 1.125, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1340 }, { "completion_length": 471.25, "epoch": 0.37167405764966743, "grad_norm": 0.3626452684402466, "kl": 0.06561433523893356, "learning_rate": 4.941385765735239e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1341 }, { "completion_length": 534.5, "epoch": 0.3719512195121951, "grad_norm": 0.4647257924079895, "kl": 0.0514705628156662, "learning_rate": 4.941291484377842e-06, "loss": -0.0, "reward": 1.875, "reward_std": 0.25, "rewards/confident_score_func": 0.125, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1342 }, { "completion_length": 509.5, "epoch": 0.37222838137472286, "grad_norm": 0.0, "kl": 0.05371285229921341, "learning_rate": 4.941197128156192e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1343 }, { "completion_length": 489.25, "epoch": 0.37250554323725055, "grad_norm": 0.3751382529735565, "kl": 0.057587362825870514, "learning_rate": 4.941102697073181e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1344 }, { "completion_length": 439.5, "epoch": 0.3727827050997783, "grad_norm": 0.4211757183074951, "kl": 0.051148708909749985, "learning_rate": 4.941008191131705e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1345 }, { "completion_length": 561.0, "epoch": 0.373059866962306, "grad_norm": 0.2932257354259491, "kl": 0.03730163723230362, "learning_rate": 4.9409136103346615e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1346 }, { "completion_length": 582.5, "epoch": 0.3733370288248337, "grad_norm": 0.31013023853302, "kl": 0.0623408667743206, "learning_rate": 4.940818954684952e-06, "loss": 0.0, "reward": 4.5, "reward_std": 1.8929693698883057, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1347 }, { "completion_length": 426.75, "epoch": 0.3736141906873614, "grad_norm": 0.0, "kl": 0.06855515390634537, "learning_rate": 4.940724224185478e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1348 }, { "completion_length": 502.25, "epoch": 0.37389135254988914, "grad_norm": 0.0, "kl": 0.058812279254198074, "learning_rate": 4.940629418839146e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1349 }, { "completion_length": 511.75, "epoch": 0.37416851441241683, "grad_norm": 0.33281219005584717, "kl": 0.0555361770093441, "learning_rate": 4.940534538648862e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1350 }, { "completion_length": 513.75, "epoch": 0.37444567627494457, "grad_norm": 0.3385156989097595, "kl": 0.04946306720376015, "learning_rate": 4.9404395836175364e-06, "loss": -0.0, "reward": 2.5, "reward_std": 1.5, "rewards/confident_score_func": 0.25, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1351 }, { "completion_length": 575.25, "epoch": 0.37472283813747226, "grad_norm": 0.38172122836112976, "kl": 0.05423729121685028, "learning_rate": 4.94034455374808e-06, "loss": -0.0, "reward": 4.875, "reward_std": 1.75, "rewards/confident_score_func": 1.625, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1352 }, { "completion_length": 450.5, "epoch": 0.375, "grad_norm": 0.4247133135795593, "kl": 0.08654285967350006, "learning_rate": 4.940249449043409e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1353 }, { "completion_length": 478.0, "epoch": 0.37527716186252774, "grad_norm": 0.36524900794029236, "kl": 0.04832373932003975, "learning_rate": 4.940154269506437e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1354 }, { "completion_length": 447.5, "epoch": 0.37555432372505543, "grad_norm": 0.38801613450050354, "kl": 0.057437196373939514, "learning_rate": 4.940059015140086e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1355 }, { "completion_length": 450.5, "epoch": 0.37583148558758317, "grad_norm": 0.0, "kl": 0.05531855672597885, "learning_rate": 4.939963685947275e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1356 }, { "completion_length": 429.5, "epoch": 0.37610864745011086, "grad_norm": 0.43914365768432617, "kl": 0.07246831059455872, "learning_rate": 4.939868281930928e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1357 }, { "completion_length": 468.25, "epoch": 0.3763858093126386, "grad_norm": 0.33803442120552063, "kl": 0.07132550328969955, "learning_rate": 4.939772803093969e-06, "loss": -0.0, "reward": 1.6875, "reward_std": 0.5153881907463074, "rewards/confident_score_func": 0.125, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.6875, "step": 1358 }, { "completion_length": 464.75, "epoch": 0.3766629711751663, "grad_norm": 0.3608713448047638, "kl": 0.04704250022768974, "learning_rate": 4.939677249439328e-06, "loss": -0.0, "reward": 1.875, "reward_std": 0.25, "rewards/confident_score_func": 0.125, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1359 }, { "completion_length": 528.5, "epoch": 0.376940133037694, "grad_norm": 0.3604702651500702, "kl": 0.049141619354486465, "learning_rate": 4.939581620969935e-06, "loss": 0.0, "reward": 5.5, "reward_std": 0.5, "rewards/confident_score_func": 1.75, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1360 }, { "completion_length": 509.0, "epoch": 0.3772172949002217, "grad_norm": 0.32625022530555725, "kl": 0.043810028582811356, "learning_rate": 4.939485917688722e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1361 }, { "completion_length": 530.25, "epoch": 0.37749445676274945, "grad_norm": 0.4462412893772125, "kl": 0.056824665516614914, "learning_rate": 4.939390139598623e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1362 }, { "completion_length": 502.25, "epoch": 0.37777161862527714, "grad_norm": 0.36889946460723877, "kl": 0.06524327397346497, "learning_rate": 4.9392942867025775e-06, "loss": -0.0, "reward": 2.5, "reward_std": 1.5, "rewards/confident_score_func": 0.25, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1363 }, { "completion_length": 502.5, "epoch": 0.3780487804878049, "grad_norm": 0.35634350776672363, "kl": 0.05932406336069107, "learning_rate": 4.939198359003522e-06, "loss": -0.0, "reward": 3.5, "reward_std": 2.0615527629852295, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1364 }, { "completion_length": 489.75, "epoch": 0.37832594235033257, "grad_norm": 0.0, "kl": 0.05285346135497093, "learning_rate": 4.939102356504401e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1365 }, { "completion_length": 494.25, "epoch": 0.3786031042128603, "grad_norm": 0.0, "kl": 0.04920567199587822, "learning_rate": 4.939006279208156e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1366 }, { "completion_length": 544.5, "epoch": 0.37888026607538805, "grad_norm": 0.5259163975715637, "kl": 0.05741789937019348, "learning_rate": 4.938910127117735e-06, "loss": -0.0, "reward": 2.875, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1367 }, { "completion_length": 502.0, "epoch": 0.37915742793791574, "grad_norm": 0.29953861236572266, "kl": 0.05528295785188675, "learning_rate": 4.938813900236086e-06, "loss": 0.0, "reward": 5.5, "reward_std": 0.5, "rewards/confident_score_func": 1.75, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1368 }, { "completion_length": 434.0, "epoch": 0.3794345898004435, "grad_norm": 0.3834942579269409, "kl": 0.06352014094591141, "learning_rate": 4.93871759856616e-06, "loss": 0.0, "reward": 3.5, "reward_std": 2.0615527629852295, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1369 }, { "completion_length": 521.0, "epoch": 0.37971175166297116, "grad_norm": 0.35198143124580383, "kl": 0.04548854008316994, "learning_rate": 4.938621222110909e-06, "loss": 0.0, "reward": 5.5, "reward_std": 0.5, "rewards/confident_score_func": 1.75, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1370 }, { "completion_length": 487.25, "epoch": 0.3799889135254989, "grad_norm": 0.3453386425971985, "kl": 0.048599500209093094, "learning_rate": 4.938524770873291e-06, "loss": -0.0, "reward": 4.75, "reward_std": 1.4142135381698608, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1371 }, { "completion_length": 473.75, "epoch": 0.3802660753880266, "grad_norm": 0.3043130338191986, "kl": 0.045969799160957336, "learning_rate": 4.938428244856262e-06, "loss": 0.0, "reward": 4.5, "reward_std": 1.8929693698883057, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1372 }, { "completion_length": 449.75, "epoch": 0.38054323725055433, "grad_norm": 0.3861677944660187, "kl": 0.058848652988672256, "learning_rate": 4.938331644062782e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1373 }, { "completion_length": 472.25, "epoch": 0.380820399113082, "grad_norm": 0.4477904736995697, "kl": 0.056084029376506805, "learning_rate": 4.9382349684958135e-06, "loss": -0.0, "reward": 2.875, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1374 }, { "completion_length": 483.0, "epoch": 0.38109756097560976, "grad_norm": 0.38648277521133423, "kl": 0.06156277656555176, "learning_rate": 4.938138218158321e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1375 }, { "completion_length": 468.25, "epoch": 0.38137472283813745, "grad_norm": 0.0, "kl": 0.055163703858852386, "learning_rate": 4.938041393053273e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1376 }, { "completion_length": 476.5, "epoch": 0.3816518847006652, "grad_norm": 0.3508663475513458, "kl": 0.04883846268057823, "learning_rate": 4.937944493183637e-06, "loss": 0.0, "reward": 2.875, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1377 }, { "completion_length": 516.25, "epoch": 0.38192904656319293, "grad_norm": 0.3286697566509247, "kl": 0.059955958276987076, "learning_rate": 4.937847518552384e-06, "loss": -0.0, "reward": 2.875, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1378 }, { "completion_length": 477.5, "epoch": 0.3822062084257206, "grad_norm": 0.45182135701179504, "kl": 0.053686436265707016, "learning_rate": 4.93775046916249e-06, "loss": -0.0, "reward": 2.875, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1379 }, { "completion_length": 506.25, "epoch": 0.38248337028824836, "grad_norm": 0.0, "kl": 0.057031985372304916, "learning_rate": 4.9376533450169295e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1380 }, { "completion_length": 539.5, "epoch": 0.38276053215077604, "grad_norm": 0.0, "kl": 0.04983818903565407, "learning_rate": 4.937556146118682e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1381 }, { "completion_length": 425.25, "epoch": 0.3830376940133038, "grad_norm": 0.0, "kl": 0.06531424820423126, "learning_rate": 4.937458872470726e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1382 }, { "completion_length": 464.25, "epoch": 0.3833148558758315, "grad_norm": 0.0, "kl": 0.059294652193784714, "learning_rate": 4.937361524076047e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1383 }, { "completion_length": 495.5, "epoch": 0.3835920177383592, "grad_norm": 0.0, "kl": 0.0512576587498188, "learning_rate": 4.937264100937629e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1384 }, { "completion_length": 458.5, "epoch": 0.3838691796008869, "grad_norm": 0.3883955478668213, "kl": 0.06945037841796875, "learning_rate": 4.93716660305846e-06, "loss": 0.0, "reward": 5.5, "reward_std": 0.5, "rewards/confident_score_func": 1.75, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1385 }, { "completion_length": 490.0, "epoch": 0.38414634146341464, "grad_norm": 0.43745672702789307, "kl": 0.051635220646858215, "learning_rate": 4.937069030441529e-06, "loss": -0.0, "reward": 2.875, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1386 }, { "completion_length": 570.5, "epoch": 0.38442350332594233, "grad_norm": 0.3147524893283844, "kl": 0.05954831466078758, "learning_rate": 4.936971383089829e-06, "loss": 0.0, "reward": 4.5, "reward_std": 1.8929693698883057, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1387 }, { "completion_length": 451.25, "epoch": 0.38470066518847007, "grad_norm": 0.0, "kl": 0.06611743569374084, "learning_rate": 4.936873661006355e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1388 }, { "completion_length": 495.25, "epoch": 0.38497782705099776, "grad_norm": 0.3813982307910919, "kl": 0.05742989107966423, "learning_rate": 4.936775864194101e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1389 }, { "completion_length": 495.0, "epoch": 0.3852549889135255, "grad_norm": 0.0, "kl": 0.04572109505534172, "learning_rate": 4.9366779926560705e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1390 }, { "completion_length": 504.75, "epoch": 0.38553215077605324, "grad_norm": 0.3582320809364319, "kl": 0.04914230480790138, "learning_rate": 4.9365800463952604e-06, "loss": -0.0, "reward": 3.5, "reward_std": 2.0615527629852295, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1391 }, { "completion_length": 562.0, "epoch": 0.3858093126385809, "grad_norm": 0.3225942552089691, "kl": 0.046568598598241806, "learning_rate": 4.936482025414677e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1392 }, { "completion_length": 513.75, "epoch": 0.38608647450110867, "grad_norm": 0.3263009190559387, "kl": 0.054059334099292755, "learning_rate": 4.9363839297173254e-06, "loss": 0.0, "reward": 4.5, "reward_std": 1.8929693698883057, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1393 }, { "completion_length": 466.25, "epoch": 0.38636363636363635, "grad_norm": 0.35686641931533813, "kl": 0.09268882125616074, "learning_rate": 4.9362857593062145e-06, "loss": 0.0, "reward": 5.25, "reward_std": 0.5773502588272095, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1394 }, { "completion_length": 549.25, "epoch": 0.3866407982261641, "grad_norm": 0.3068579137325287, "kl": 0.054560303688049316, "learning_rate": 4.936187514184353e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1395 }, { "completion_length": 445.75, "epoch": 0.3869179600886918, "grad_norm": 0.39452892541885376, "kl": 0.06582757830619812, "learning_rate": 4.936089194354755e-06, "loss": 0.0, "reward": 4.0, "reward_std": 2.020725965499878, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1396 }, { "completion_length": 453.0, "epoch": 0.3871951219512195, "grad_norm": 0.3826623260974884, "kl": 0.05150391533970833, "learning_rate": 4.9359907998204344e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1397 }, { "completion_length": 572.5, "epoch": 0.3874722838137472, "grad_norm": 0.0, "kl": 0.056709788739681244, "learning_rate": 4.935892330584411e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1398 }, { "completion_length": 514.75, "epoch": 0.38774944567627495, "grad_norm": 0.3710484206676483, "kl": 0.08306461572647095, "learning_rate": 4.935793786649702e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1399 }, { "completion_length": 529.75, "epoch": 0.38802660753880264, "grad_norm": 0.37997063994407654, "kl": 0.05177651718258858, "learning_rate": 4.9356951680193304e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1400 }, { "completion_length": 464.0, "epoch": 0.3883037694013304, "grad_norm": 0.0, "kl": 0.04102465137839317, "learning_rate": 4.93559647469632e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1401 }, { "completion_length": 452.0, "epoch": 0.3885809312638581, "grad_norm": 0.42068544030189514, "kl": 0.052285097539424896, "learning_rate": 4.935497706683698e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1402 }, { "completion_length": 458.0, "epoch": 0.3888580931263858, "grad_norm": 0.34591326117515564, "kl": 0.0673544704914093, "learning_rate": 4.935398863984493e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1403 }, { "completion_length": 480.25, "epoch": 0.38913525498891355, "grad_norm": 0.4287869334220886, "kl": 0.07409847527742386, "learning_rate": 4.935299946601735e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1404 }, { "completion_length": 513.75, "epoch": 0.38941241685144123, "grad_norm": 0.0, "kl": 0.05252080783247948, "learning_rate": 4.9352009545384585e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1405 }, { "completion_length": 506.0, "epoch": 0.389689578713969, "grad_norm": 0.39282336831092834, "kl": 0.057282429188489914, "learning_rate": 4.935101887797699e-06, "loss": -0.0, "reward": 2.875, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1406 }, { "completion_length": 451.0, "epoch": 0.38996674057649666, "grad_norm": 0.44476187229156494, "kl": 0.05306202918291092, "learning_rate": 4.9350027463824944e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1407 }, { "completion_length": 411.0, "epoch": 0.3902439024390244, "grad_norm": 0.6219346523284912, "kl": 0.05120658874511719, "learning_rate": 4.9349035302958846e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1408 }, { "completion_length": 477.25, "epoch": 0.3905210643015521, "grad_norm": 0.36947575211524963, "kl": 0.06199745461344719, "learning_rate": 4.934804239540913e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1409 }, { "completion_length": 517.0, "epoch": 0.39079822616407983, "grad_norm": 0.358614444732666, "kl": 0.048207998275756836, "learning_rate": 4.9347048741206235e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1410 }, { "completion_length": 515.25, "epoch": 0.3910753880266075, "grad_norm": 0.3421225845813751, "kl": 0.05311097949743271, "learning_rate": 4.934605434038064e-06, "loss": 0.0, "reward": 5.5, "reward_std": 0.5, "rewards/confident_score_func": 1.75, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1411 }, { "completion_length": 511.0, "epoch": 0.39135254988913526, "grad_norm": 0.0, "kl": 0.06145704910159111, "learning_rate": 4.934505919296284e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1412 }, { "completion_length": 516.75, "epoch": 0.39162971175166295, "grad_norm": 0.46646392345428467, "kl": 0.04775751754641533, "learning_rate": 4.9344063298983345e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1413 }, { "completion_length": 500.0, "epoch": 0.3919068736141907, "grad_norm": 0.3525021970272064, "kl": 0.06506999582052231, "learning_rate": 4.934306665847269e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1414 }, { "completion_length": 465.5, "epoch": 0.39218403547671843, "grad_norm": 0.36213064193725586, "kl": 0.1458076387643814, "learning_rate": 4.934206927146145e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1415 }, { "completion_length": 522.25, "epoch": 0.3924611973392461, "grad_norm": 0.32537582516670227, "kl": 0.050606049597263336, "learning_rate": 4.93410711379802e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1416 }, { "completion_length": 435.0, "epoch": 0.39273835920177386, "grad_norm": 0.32321953773498535, "kl": 0.058476001024246216, "learning_rate": 4.934007225805956e-06, "loss": -0.0, "reward": 4.875, "reward_std": 1.75, "rewards/confident_score_func": 1.625, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1417 }, { "completion_length": 463.5, "epoch": 0.39301552106430154, "grad_norm": 0.3613390028476715, "kl": 0.053820956498384476, "learning_rate": 4.933907263173016e-06, "loss": 0.0, "reward": 4.5, "reward_std": 1.8929693698883057, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1418 }, { "completion_length": 415.25, "epoch": 0.3932926829268293, "grad_norm": 0.3773825168609619, "kl": 0.08271601051092148, "learning_rate": 4.933807225902265e-06, "loss": 0.0, "reward": 2.625, "reward_std": 1.4361406564712524, "rewards/confident_score_func": 0.375, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1419 }, { "completion_length": 489.75, "epoch": 0.39356984478935697, "grad_norm": 0.37435147166252136, "kl": 0.06672045588493347, "learning_rate": 4.9337071139967695e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1420 }, { "completion_length": 442.25, "epoch": 0.3938470066518847, "grad_norm": 0.39475390315055847, "kl": 0.062207289040088654, "learning_rate": 4.9336069274596025e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1421 }, { "completion_length": 506.0, "epoch": 0.3941241685144124, "grad_norm": 0.36808091402053833, "kl": 0.0577937476336956, "learning_rate": 4.933506666293834e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1422 }, { "completion_length": 522.25, "epoch": 0.39440133037694014, "grad_norm": 0.4030013680458069, "kl": 0.05725213885307312, "learning_rate": 4.933406330502538e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1423 }, { "completion_length": 520.0, "epoch": 0.3946784922394678, "grad_norm": 0.0, "kl": 0.05627312883734703, "learning_rate": 4.933305920088794e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1424 }, { "completion_length": 489.0, "epoch": 0.39495565410199557, "grad_norm": 0.35317012667655945, "kl": 0.0494144968688488, "learning_rate": 4.933205435055679e-06, "loss": 0.0, "reward": 5.5, "reward_std": 0.5, "rewards/confident_score_func": 1.75, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1425 }, { "completion_length": 446.5, "epoch": 0.39523281596452325, "grad_norm": 0.38168638944625854, "kl": 0.38674676418304443, "learning_rate": 4.933104875406275e-06, "loss": -0.0, "reward": 3.5, "reward_std": 2.0615527629852295, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1426 }, { "completion_length": 467.25, "epoch": 0.395509977827051, "grad_norm": 0.3532910645008087, "kl": 0.05125485733151436, "learning_rate": 4.933004241143667e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1427 }, { "completion_length": 417.5, "epoch": 0.39578713968957874, "grad_norm": 0.0, "kl": 0.0660596713423729, "learning_rate": 4.932903532270939e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1428 }, { "completion_length": 471.25, "epoch": 0.3960643015521064, "grad_norm": 0.38469448685646057, "kl": 0.057430658489465714, "learning_rate": 4.93280274879118e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1429 }, { "completion_length": 466.0, "epoch": 0.39634146341463417, "grad_norm": 0.36801913380622864, "kl": 0.08663614839315414, "learning_rate": 4.932701890707482e-06, "loss": 0.0, "reward": 3.875, "reward_std": 2.1746647357940674, "rewards/confident_score_func": 1.125, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1430 }, { "completion_length": 464.0, "epoch": 0.39661862527716185, "grad_norm": 0.3849039077758789, "kl": 0.05329512432217598, "learning_rate": 4.932600958022936e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1431 }, { "completion_length": 435.75, "epoch": 0.3968957871396896, "grad_norm": 0.37859389185905457, "kl": 0.06592854112386703, "learning_rate": 4.932499950740639e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1432 }, { "completion_length": 554.5, "epoch": 0.3971729490022173, "grad_norm": 0.36667194962501526, "kl": 0.04753952473402023, "learning_rate": 4.932398868863687e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1433 }, { "completion_length": 534.5, "epoch": 0.397450110864745, "grad_norm": 0.3143174648284912, "kl": 0.059291090816259384, "learning_rate": 4.93229771239518e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1434 }, { "completion_length": 495.75, "epoch": 0.3977272727272727, "grad_norm": 0.3720419108867645, "kl": 0.05302039533853531, "learning_rate": 4.93219648133822e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1435 }, { "completion_length": 490.75, "epoch": 0.39800443458980045, "grad_norm": 0.3090667128562927, "kl": 0.08245391398668289, "learning_rate": 4.932095175695911e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1436 }, { "completion_length": 516.5, "epoch": 0.39828159645232813, "grad_norm": 0.3718932271003723, "kl": 0.05057603120803833, "learning_rate": 4.931993795471361e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1437 }, { "completion_length": 484.25, "epoch": 0.3985587583148559, "grad_norm": 0.3460092544555664, "kl": 0.05695844441652298, "learning_rate": 4.931892340667679e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1438 }, { "completion_length": 522.25, "epoch": 0.3988359201773836, "grad_norm": 0.3434421420097351, "kl": 0.053326766937971115, "learning_rate": 4.931790811287974e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1439 }, { "completion_length": 490.0, "epoch": 0.3991130820399113, "grad_norm": 0.31711214780807495, "kl": 0.052056487649679184, "learning_rate": 4.931689207335362e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1440 }, { "completion_length": 456.5, "epoch": 0.39939024390243905, "grad_norm": 0.3889160454273224, "kl": 0.05660941079258919, "learning_rate": 4.931587528812957e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1441 }, { "completion_length": 496.25, "epoch": 0.39966740576496673, "grad_norm": 0.0, "kl": 0.04918017238378525, "learning_rate": 4.931485775723878e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1442 }, { "completion_length": 409.75, "epoch": 0.3999445676274945, "grad_norm": 0.46599385142326355, "kl": 0.1001264676451683, "learning_rate": 4.931383948071244e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1443 }, { "completion_length": 423.75, "epoch": 0.40022172949002216, "grad_norm": 0.383283406496048, "kl": 0.061741702258586884, "learning_rate": 4.931282045858179e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1444 }, { "completion_length": 515.75, "epoch": 0.4004988913525499, "grad_norm": 0.3696788251399994, "kl": 0.05937353894114494, "learning_rate": 4.931180069087808e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1445 }, { "completion_length": 474.75, "epoch": 0.4007760532150776, "grad_norm": 0.0, "kl": 0.09788341075181961, "learning_rate": 4.9310780177632575e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1446 }, { "completion_length": 459.0, "epoch": 0.40105321507760533, "grad_norm": 0.36394593119621277, "kl": 0.08033008873462677, "learning_rate": 4.930975891887657e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1447 }, { "completion_length": 488.25, "epoch": 0.401330376940133, "grad_norm": 0.39578568935394287, "kl": 0.06000912934541702, "learning_rate": 4.9308736914641385e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1448 }, { "completion_length": 540.75, "epoch": 0.40160753880266076, "grad_norm": 0.3828272521495819, "kl": 0.072372667491436, "learning_rate": 4.930771416495836e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1449 }, { "completion_length": 514.75, "epoch": 0.40188470066518844, "grad_norm": 0.3506024479866028, "kl": 0.052970144897699356, "learning_rate": 4.930669066985887e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1450 }, { "completion_length": 420.75, "epoch": 0.4021618625277162, "grad_norm": 0.3717309534549713, "kl": 0.05558980256319046, "learning_rate": 4.930566642937428e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1451 }, { "completion_length": 525.25, "epoch": 0.4024390243902439, "grad_norm": 0.3540106415748596, "kl": 0.060170505195856094, "learning_rate": 4.9304641443536015e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1452 }, { "completion_length": 419.75, "epoch": 0.4027161862527716, "grad_norm": 0.41150712966918945, "kl": 0.06657290458679199, "learning_rate": 4.9303615712375494e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1453 }, { "completion_length": 519.75, "epoch": 0.40299334811529935, "grad_norm": 0.0, "kl": 0.05340346321463585, "learning_rate": 4.9302589235924185e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1454 }, { "completion_length": 462.5, "epoch": 0.40327050997782704, "grad_norm": 0.37610819935798645, "kl": 0.05330757051706314, "learning_rate": 4.930156201421356e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1455 }, { "completion_length": 480.25, "epoch": 0.4035476718403548, "grad_norm": 0.38109898567199707, "kl": 0.0522647351026535, "learning_rate": 4.930053404727512e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1456 }, { "completion_length": 416.5, "epoch": 0.40382483370288247, "grad_norm": 0.38076144456863403, "kl": 0.07566798478364944, "learning_rate": 4.929950533514039e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1457 }, { "completion_length": 395.25, "epoch": 0.4041019955654102, "grad_norm": 0.5030560493469238, "kl": 0.06192229315638542, "learning_rate": 4.929847587784091e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1458 }, { "completion_length": 478.25, "epoch": 0.4043791574279379, "grad_norm": 0.36319825053215027, "kl": 0.06454234570264816, "learning_rate": 4.929744567540826e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1459 }, { "completion_length": 465.5, "epoch": 0.40465631929046564, "grad_norm": 0.42256441712379456, "kl": 0.05334483087062836, "learning_rate": 4.929641472787402e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1460 }, { "completion_length": 458.25, "epoch": 0.4049334811529933, "grad_norm": 0.8030058145523071, "kl": 0.05675234645605087, "learning_rate": 4.929538303526982e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1461 }, { "completion_length": 471.0, "epoch": 0.40521064301552107, "grad_norm": 0.36924996972084045, "kl": 0.054916754364967346, "learning_rate": 4.929435059762729e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1462 }, { "completion_length": 454.75, "epoch": 0.4054878048780488, "grad_norm": 0.0, "kl": 0.07992121577262878, "learning_rate": 4.929331741497807e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1463 }, { "completion_length": 509.5, "epoch": 0.4057649667405765, "grad_norm": 0.0, "kl": 0.06486242264509201, "learning_rate": 4.929228348735387e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1464 }, { "completion_length": 415.75, "epoch": 0.40604212860310424, "grad_norm": 0.39422905445098877, "kl": 0.07101328670978546, "learning_rate": 4.92912488147864e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1465 }, { "completion_length": 425.75, "epoch": 0.4063192904656319, "grad_norm": 0.0, "kl": 0.0683961883187294, "learning_rate": 4.929021339730737e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1466 }, { "completion_length": 481.75, "epoch": 0.40659645232815966, "grad_norm": 0.3863722085952759, "kl": 0.06922660768032074, "learning_rate": 4.928917723494854e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1467 }, { "completion_length": 513.75, "epoch": 0.40687361419068735, "grad_norm": 0.37050706148147583, "kl": 0.06085878610610962, "learning_rate": 4.928814032774169e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1468 }, { "completion_length": 438.0, "epoch": 0.4071507760532151, "grad_norm": 0.4291159510612488, "kl": 0.06410027295351028, "learning_rate": 4.92871026757186e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1469 }, { "completion_length": 523.75, "epoch": 0.4074279379157428, "grad_norm": 0.3589335083961487, "kl": 0.0683947429060936, "learning_rate": 4.928606427891112e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1470 }, { "completion_length": 451.5, "epoch": 0.4077050997782705, "grad_norm": 0.0, "kl": 0.07920465618371964, "learning_rate": 4.9285025137351065e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1471 }, { "completion_length": 475.0, "epoch": 0.4079822616407982, "grad_norm": 0.40855732560157776, "kl": 0.0633372962474823, "learning_rate": 4.928398525107031e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1472 }, { "completion_length": 513.5, "epoch": 0.40825942350332595, "grad_norm": 0.3667266070842743, "kl": 0.06207367405295372, "learning_rate": 4.928294462010075e-06, "loss": 0.0, "reward": 4.5, "reward_std": 1.8929693698883057, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1473 }, { "completion_length": 583.25, "epoch": 0.40853658536585363, "grad_norm": 0.3451665937900543, "kl": 0.06113618612289429, "learning_rate": 4.92819032444743e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1474 }, { "completion_length": 454.5, "epoch": 0.4088137472283814, "grad_norm": 2.588773250579834, "kl": 0.053184740245342255, "learning_rate": 4.928086112422288e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1475 }, { "completion_length": 415.75, "epoch": 0.4090909090909091, "grad_norm": 0.4762076437473297, "kl": 0.07845556735992432, "learning_rate": 4.927981825937846e-06, "loss": 0.0, "reward": 5.5625, "reward_std": 0.375, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.6875, "step": 1476 }, { "completion_length": 532.75, "epoch": 0.4093680709534368, "grad_norm": 0.3331492841243744, "kl": 0.07458021491765976, "learning_rate": 4.9278774649973e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1477 }, { "completion_length": 495.75, "epoch": 0.40964523281596454, "grad_norm": 0.3767250180244446, "kl": 0.07240107655525208, "learning_rate": 4.927773029603853e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1478 }, { "completion_length": 467.0, "epoch": 0.40992239467849223, "grad_norm": 0.0, "kl": 0.07169927656650543, "learning_rate": 4.927668519760707e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1479 }, { "completion_length": 546.75, "epoch": 0.41019955654101997, "grad_norm": 0.3423292934894562, "kl": 0.050885654985904694, "learning_rate": 4.927563935471066e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1480 }, { "completion_length": 489.75, "epoch": 0.41047671840354766, "grad_norm": 0.6697098612785339, "kl": 0.06390194594860077, "learning_rate": 4.927459276738137e-06, "loss": -0.0, "reward": 4.59375, "reward_std": 1.9185905456542969, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 1481 }, { "completion_length": 489.25, "epoch": 0.4107538802660754, "grad_norm": 0.3999980092048645, "kl": 0.0669020488858223, "learning_rate": 4.927354543565131e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1482 }, { "completion_length": 460.75, "epoch": 0.4110310421286031, "grad_norm": 0.0, "kl": 0.06348907202482224, "learning_rate": 4.927249735955257e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1483 }, { "completion_length": 479.75, "epoch": 0.4113082039911308, "grad_norm": 0.3600034713745117, "kl": 0.06960048526525497, "learning_rate": 4.9271448539117325e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1484 }, { "completion_length": 504.75, "epoch": 0.4115853658536585, "grad_norm": 0.37772029638290405, "kl": 0.05783412605524063, "learning_rate": 4.927039897437771e-06, "loss": 0.0, "reward": 5.5, "reward_std": 0.5, "rewards/confident_score_func": 1.75, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1485 }, { "completion_length": 493.25, "epoch": 0.41186252771618626, "grad_norm": 0.3585945963859558, "kl": 0.0737360343337059, "learning_rate": 4.926934866536592e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1486 }, { "completion_length": 386.75, "epoch": 0.41213968957871394, "grad_norm": 0.4488345682621002, "kl": 0.06354516744613647, "learning_rate": 4.926829761211417e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1487 }, { "completion_length": 526.0, "epoch": 0.4124168514412417, "grad_norm": 0.35952064394950867, "kl": 0.05762782320380211, "learning_rate": 4.926724581465468e-06, "loss": -0.0, "reward": 3.5, "reward_std": 2.0615527629852295, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1488 }, { "completion_length": 374.0, "epoch": 0.4126940133037694, "grad_norm": 0.0, "kl": 0.059258833527565, "learning_rate": 4.926619327301971e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1489 }, { "completion_length": 449.0, "epoch": 0.4129711751662971, "grad_norm": 0.0, "kl": 0.06589840352535248, "learning_rate": 4.926513998724155e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1490 }, { "completion_length": 449.5, "epoch": 0.41324833702882485, "grad_norm": 0.4047386944293976, "kl": 0.06479262560606003, "learning_rate": 4.926408595735247e-06, "loss": -0.0, "reward": 3.875, "reward_std": 2.1746647357940674, "rewards/confident_score_func": 1.125, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1491 }, { "completion_length": 440.5, "epoch": 0.41352549889135254, "grad_norm": 0.0, "kl": 0.05755314603447914, "learning_rate": 4.926303118338482e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1492 }, { "completion_length": 508.5, "epoch": 0.4138026607538803, "grad_norm": 0.3884876072406769, "kl": 0.056212540715932846, "learning_rate": 4.926197566537094e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1493 }, { "completion_length": 415.5, "epoch": 0.41407982261640797, "grad_norm": 0.0, "kl": 0.0765133872628212, "learning_rate": 4.926091940334318e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1494 }, { "completion_length": 507.5, "epoch": 0.4143569844789357, "grad_norm": 0.3790130615234375, "kl": 0.06859125941991806, "learning_rate": 4.9259862397333955e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1495 }, { "completion_length": 509.75, "epoch": 0.4146341463414634, "grad_norm": 0.44202619791030884, "kl": 0.06630643457174301, "learning_rate": 4.925880464737567e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1496 }, { "completion_length": 448.0, "epoch": 0.41491130820399114, "grad_norm": 0.0, "kl": 0.08065859973430634, "learning_rate": 4.925774615350076e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1497 }, { "completion_length": 444.5, "epoch": 0.4151884700665188, "grad_norm": 0.0, "kl": 0.06194448843598366, "learning_rate": 4.9256686915741665e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1498 }, { "completion_length": 379.75, "epoch": 0.41546563192904656, "grad_norm": 0.0, "kl": 0.0758570209145546, "learning_rate": 4.92556269341309e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1499 }, { "completion_length": 426.0, "epoch": 0.4157427937915743, "grad_norm": 0.45989924669265747, "kl": 0.07432479411363602, "learning_rate": 4.925456620870097e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1500 }, { "completion_length": 417.0, "epoch": 0.416019955654102, "grad_norm": 0.43361696600914, "kl": 0.15637077391147614, "learning_rate": 4.925350473948438e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1501 }, { "completion_length": 521.5, "epoch": 0.41629711751662973, "grad_norm": 0.39175644516944885, "kl": 0.062230050563812256, "learning_rate": 4.92524425265137e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1502 }, { "completion_length": 350.5, "epoch": 0.4165742793791574, "grad_norm": 0.0, "kl": 0.05041975900530815, "learning_rate": 4.925137956982149e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1503 }, { "completion_length": 465.25, "epoch": 0.41685144124168516, "grad_norm": 0.41629907488822937, "kl": 0.058239419013261795, "learning_rate": 4.925031586944035e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1504 }, { "completion_length": 459.5, "epoch": 0.41712860310421285, "grad_norm": 0.37644341588020325, "kl": 0.0824914500117302, "learning_rate": 4.92492514254029e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1505 }, { "completion_length": 503.0, "epoch": 0.4174057649667406, "grad_norm": 0.3591821491718292, "kl": 0.06019837036728859, "learning_rate": 4.924818623774178e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1506 }, { "completion_length": 513.0, "epoch": 0.4176829268292683, "grad_norm": 0.36020252108573914, "kl": 0.05485996976494789, "learning_rate": 4.924712030648967e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1507 }, { "completion_length": 445.5, "epoch": 0.417960088691796, "grad_norm": 0.0, "kl": 0.07731631398200989, "learning_rate": 4.924605363167924e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1508 }, { "completion_length": 402.75, "epoch": 0.4182372505543237, "grad_norm": 0.4360334575176239, "kl": 0.07367797195911407, "learning_rate": 4.9244986213343205e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1509 }, { "completion_length": 413.0, "epoch": 0.41851441241685144, "grad_norm": 0.0, "kl": 0.07778553664684296, "learning_rate": 4.92439180515143e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1510 }, { "completion_length": 409.0, "epoch": 0.41879157427937913, "grad_norm": 0.36151719093322754, "kl": 0.06868267059326172, "learning_rate": 4.924284914622528e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1511 }, { "completion_length": 455.5, "epoch": 0.4190687361419069, "grad_norm": 0.36264270544052124, "kl": 0.0626058354973793, "learning_rate": 4.924177949750893e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1512 }, { "completion_length": 460.25, "epoch": 0.4193458980044346, "grad_norm": 0.4455827474594116, "kl": 0.11809010803699493, "learning_rate": 4.924070910539804e-06, "loss": -0.0, "reward": 3.875, "reward_std": 2.1746647357940674, "rewards/confident_score_func": 1.125, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1513 }, { "completion_length": 488.25, "epoch": 0.4196230598669623, "grad_norm": 0.0, "kl": 0.06298693269491196, "learning_rate": 4.923963796992543e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1514 }, { "completion_length": 456.75, "epoch": 0.41990022172949004, "grad_norm": 0.41573366522789, "kl": 0.06875951588153839, "learning_rate": 4.923856609112397e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1515 }, { "completion_length": 460.75, "epoch": 0.42017738359201773, "grad_norm": 0.4103153347969055, "kl": 0.06490899622440338, "learning_rate": 4.923749346902652e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1516 }, { "completion_length": 420.5, "epoch": 0.42045454545454547, "grad_norm": 0.0, "kl": 0.06297354400157928, "learning_rate": 4.923642010366597e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1517 }, { "completion_length": 470.75, "epoch": 0.42073170731707316, "grad_norm": 0.402704656124115, "kl": 0.06311918050050735, "learning_rate": 4.923534599507524e-06, "loss": -0.0, "reward": 3.875, "reward_std": 2.1746647357940674, "rewards/confident_score_func": 1.125, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1518 }, { "completion_length": 513.75, "epoch": 0.4210088691796009, "grad_norm": 0.35568326711654663, "kl": 0.06603832542896271, "learning_rate": 4.923427114328725e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1519 }, { "completion_length": 447.25, "epoch": 0.4212860310421286, "grad_norm": 0.4693082869052887, "kl": 0.07735780626535416, "learning_rate": 4.9233195548334986e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1520 }, { "completion_length": 477.25, "epoch": 0.4215631929046563, "grad_norm": 0.3768734633922577, "kl": 0.08166074752807617, "learning_rate": 4.9232119210251426e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1521 }, { "completion_length": 323.0, "epoch": 0.421840354767184, "grad_norm": 0.5499096512794495, "kl": 0.2311004102230072, "learning_rate": 4.923104212906957e-06, "loss": 0.0, "reward": 3.5, "reward_std": 2.0615527629852295, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1522 }, { "completion_length": 444.5, "epoch": 0.42211751662971175, "grad_norm": 0.39228177070617676, "kl": 0.06755849719047546, "learning_rate": 4.922996430482244e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1523 }, { "completion_length": 500.75, "epoch": 0.4223946784922395, "grad_norm": 0.36408883333206177, "kl": 0.05820838734507561, "learning_rate": 4.9228885737543115e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1524 }, { "completion_length": 424.0, "epoch": 0.4226718403547672, "grad_norm": 0.4329473674297333, "kl": 0.08738986402750015, "learning_rate": 4.922780642726465e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1525 }, { "completion_length": 484.5, "epoch": 0.4229490022172949, "grad_norm": 0.35237181186676025, "kl": 0.057271577417850494, "learning_rate": 4.922672637402014e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1526 }, { "completion_length": 416.75, "epoch": 0.4232261640798226, "grad_norm": 0.0, "kl": 0.07079701125621796, "learning_rate": 4.922564557784271e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1527 }, { "completion_length": 438.75, "epoch": 0.42350332594235035, "grad_norm": 0.0, "kl": 0.06718822568655014, "learning_rate": 4.922456403876552e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1528 }, { "completion_length": 448.75, "epoch": 0.42378048780487804, "grad_norm": 0.37005406618118286, "kl": 0.06497834622859955, "learning_rate": 4.922348175682171e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1529 }, { "completion_length": 443.0, "epoch": 0.4240576496674058, "grad_norm": 0.39332154393196106, "kl": 0.06735480576753616, "learning_rate": 4.922239873204449e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1530 }, { "completion_length": 452.75, "epoch": 0.42433481152993346, "grad_norm": 0.3639281690120697, "kl": 0.06982588022947311, "learning_rate": 4.922131496446706e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1531 }, { "completion_length": 409.25, "epoch": 0.4246119733924612, "grad_norm": 0.5537765622138977, "kl": 0.06721208989620209, "learning_rate": 4.922023045412266e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1532 }, { "completion_length": 466.25, "epoch": 0.4248891352549889, "grad_norm": 0.0, "kl": 0.07444163411855698, "learning_rate": 4.921914520104455e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1533 }, { "completion_length": 489.75, "epoch": 0.42516629711751663, "grad_norm": 0.3565160930156708, "kl": 0.07392216473817825, "learning_rate": 4.9218059205266e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1534 }, { "completion_length": 504.0, "epoch": 0.4254434589800443, "grad_norm": 0.4122638702392578, "kl": 0.07507886737585068, "learning_rate": 4.921697246682032e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1535 }, { "completion_length": 527.75, "epoch": 0.42572062084257206, "grad_norm": 0.4207209646701813, "kl": 0.07633686810731888, "learning_rate": 4.921588498574083e-06, "loss": 0.0, "reward": 4.5, "reward_std": 1.8929693698883057, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1536 }, { "completion_length": 380.75, "epoch": 0.4259977827050998, "grad_norm": 0.5064929127693176, "kl": 0.06725656241178513, "learning_rate": 4.921479676206089e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1537 }, { "completion_length": 445.25, "epoch": 0.4262749445676275, "grad_norm": 0.37066569924354553, "kl": 0.17622508108615875, "learning_rate": 4.921370779581386e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1538 }, { "completion_length": 406.0, "epoch": 0.42655210643015523, "grad_norm": 0.0, "kl": 0.08078837394714355, "learning_rate": 4.921261808703314e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1539 }, { "completion_length": 443.25, "epoch": 0.4268292682926829, "grad_norm": 0.3778543174266815, "kl": 0.0761023461818695, "learning_rate": 4.921152763575214e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1540 }, { "completion_length": 521.5, "epoch": 0.42710643015521066, "grad_norm": 0.4070785939693451, "kl": 0.056149374693632126, "learning_rate": 4.921043644200432e-06, "loss": -0.0, "reward": 4.875, "reward_std": 1.75, "rewards/confident_score_func": 1.625, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1541 }, { "completion_length": 451.0, "epoch": 0.42738359201773835, "grad_norm": 0.45216992497444153, "kl": 0.09609866887331009, "learning_rate": 4.920934450582311e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1542 }, { "completion_length": 443.75, "epoch": 0.4276607538802661, "grad_norm": 0.5147321224212646, "kl": 0.056542329490184784, "learning_rate": 4.920825182724203e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1543 }, { "completion_length": 428.5, "epoch": 0.4279379157427938, "grad_norm": 0.4488507807254791, "kl": 0.06824062019586563, "learning_rate": 4.920715840629456e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1544 }, { "completion_length": 520.75, "epoch": 0.4282150776053215, "grad_norm": 0.3668448030948639, "kl": 0.06416673213243484, "learning_rate": 4.920606424301424e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1545 }, { "completion_length": 481.0, "epoch": 0.4284922394678492, "grad_norm": 0.40171125531196594, "kl": 0.10842398554086685, "learning_rate": 4.9204969337434615e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1546 }, { "completion_length": 449.75, "epoch": 0.42876940133037694, "grad_norm": 0.0, "kl": 0.07211615890264511, "learning_rate": 4.920387368958929e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1547 }, { "completion_length": 473.75, "epoch": 0.42904656319290463, "grad_norm": 0.3214860260486603, "kl": 0.11261209100484848, "learning_rate": 4.9202777299511826e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1548 }, { "completion_length": 392.25, "epoch": 0.42932372505543237, "grad_norm": 0.0, "kl": 0.07928167283535004, "learning_rate": 4.920168016723588e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1549 }, { "completion_length": 433.0, "epoch": 0.4296008869179601, "grad_norm": 0.3496089577674866, "kl": 0.07316854596138, "learning_rate": 4.920058229279507e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1550 }, { "completion_length": 344.75, "epoch": 0.4298780487804878, "grad_norm": 0.0, "kl": 0.06095442920923233, "learning_rate": 4.919948367622307e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1551 }, { "completion_length": 386.5, "epoch": 0.43015521064301554, "grad_norm": 0.3839717507362366, "kl": 0.11321526020765305, "learning_rate": 4.919838431755359e-06, "loss": -0.0, "reward": 2.875, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1552 }, { "completion_length": 524.0, "epoch": 0.4304323725055432, "grad_norm": 0.32637137174606323, "kl": 0.055539023131132126, "learning_rate": 4.919728421682031e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1553 }, { "completion_length": 431.0, "epoch": 0.43070953436807097, "grad_norm": 0.3743930459022522, "kl": 0.0598931759595871, "learning_rate": 4.9196183374056985e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1554 }, { "completion_length": 435.25, "epoch": 0.43098669623059865, "grad_norm": 0.0, "kl": 0.07895210385322571, "learning_rate": 4.919508178929737e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1555 }, { "completion_length": 403.0, "epoch": 0.4312638580931264, "grad_norm": 0.0, "kl": 0.05956922098994255, "learning_rate": 4.919397946257525e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1556 }, { "completion_length": 490.5, "epoch": 0.4315410199556541, "grad_norm": 0.0, "kl": 0.06866301596164703, "learning_rate": 4.919287639392442e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1557 }, { "completion_length": 441.5, "epoch": 0.4318181818181818, "grad_norm": 0.38694050908088684, "kl": 0.06213100627064705, "learning_rate": 4.91917725833787e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1558 }, { "completion_length": 395.25, "epoch": 0.4320953436807095, "grad_norm": 0.3793298304080963, "kl": 0.06346035748720169, "learning_rate": 4.919066803097197e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1559 }, { "completion_length": 535.75, "epoch": 0.43237250554323725, "grad_norm": 0.3728235960006714, "kl": 0.056326739490032196, "learning_rate": 4.918956273673807e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1560 }, { "completion_length": 405.25, "epoch": 0.432649667405765, "grad_norm": 0.4192271828651428, "kl": 0.1350809633731842, "learning_rate": 4.918845670071091e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1561 }, { "completion_length": 477.0, "epoch": 0.4329268292682927, "grad_norm": 0.3228679597377777, "kl": 0.05742420256137848, "learning_rate": 4.91873499229244e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1562 }, { "completion_length": 367.0, "epoch": 0.4332039911308204, "grad_norm": 0.0, "kl": 0.06761697679758072, "learning_rate": 4.918624240341249e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1563 }, { "completion_length": 565.0, "epoch": 0.4334811529933481, "grad_norm": 0.0, "kl": 0.06581741571426392, "learning_rate": 4.918513414220914e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1564 }, { "completion_length": 440.0, "epoch": 0.43375831485587585, "grad_norm": 0.3996034562587738, "kl": 0.06599660217761993, "learning_rate": 4.918402513934833e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1565 }, { "completion_length": 447.75, "epoch": 0.43403547671840353, "grad_norm": 0.0, "kl": 0.06911108642816544, "learning_rate": 4.918291539486407e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1566 }, { "completion_length": 456.25, "epoch": 0.4343126385809313, "grad_norm": 0.0, "kl": 0.1074269488453865, "learning_rate": 4.91818049087904e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1567 }, { "completion_length": 445.75, "epoch": 0.43458980044345896, "grad_norm": 0.46449920535087585, "kl": 0.06023138388991356, "learning_rate": 4.918069368116136e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1568 }, { "completion_length": 513.75, "epoch": 0.4348669623059867, "grad_norm": 0.4508836567401886, "kl": 0.05595502629876137, "learning_rate": 4.917958171201104e-06, "loss": -0.0, "reward": 3.875, "reward_std": 2.1746647357940674, "rewards/confident_score_func": 1.125, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1569 }, { "completion_length": 548.25, "epoch": 0.4351441241685144, "grad_norm": 0.3586242198944092, "kl": 0.0805191844701767, "learning_rate": 4.917846900137353e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1570 }, { "completion_length": 516.0, "epoch": 0.43542128603104213, "grad_norm": 0.5993136763572693, "kl": 0.06550391018390656, "learning_rate": 4.917735554928296e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1571 }, { "completion_length": 436.25, "epoch": 0.4356984478935698, "grad_norm": 0.4097466468811035, "kl": 0.07512379437685013, "learning_rate": 4.917624135577346e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1572 }, { "completion_length": 401.75, "epoch": 0.43597560975609756, "grad_norm": 0.4032766819000244, "kl": 0.06163035333156586, "learning_rate": 4.917512642087922e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1573 }, { "completion_length": 445.75, "epoch": 0.4362527716186253, "grad_norm": 0.38199830055236816, "kl": 0.0685831606388092, "learning_rate": 4.917401074463441e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1574 }, { "completion_length": 465.0, "epoch": 0.436529933481153, "grad_norm": 0.0, "kl": 0.06559338420629501, "learning_rate": 4.917289432707326e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1575 }, { "completion_length": 457.5, "epoch": 0.43680709534368073, "grad_norm": 0.0, "kl": 0.0576230064034462, "learning_rate": 4.9171777168229986e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1576 }, { "completion_length": 473.0, "epoch": 0.4370842572062084, "grad_norm": 0.38731104135513306, "kl": 0.07873260229825974, "learning_rate": 4.917065926813887e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1577 }, { "completion_length": 438.0, "epoch": 0.43736141906873616, "grad_norm": 0.38116443157196045, "kl": 0.05217940732836723, "learning_rate": 4.916954062683418e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1578 }, { "completion_length": 430.25, "epoch": 0.43763858093126384, "grad_norm": 0.3713483214378357, "kl": 0.072413370013237, "learning_rate": 4.916842124435022e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1579 }, { "completion_length": 414.5, "epoch": 0.4379157427937916, "grad_norm": 0.0, "kl": 0.08273807168006897, "learning_rate": 4.9167301120721315e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1580 }, { "completion_length": 466.25, "epoch": 0.43819290465631927, "grad_norm": 0.40991950035095215, "kl": 0.05601911246776581, "learning_rate": 4.916618025598182e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1581 }, { "completion_length": 485.25, "epoch": 0.438470066518847, "grad_norm": 0.3959043323993683, "kl": 0.06641871482133865, "learning_rate": 4.916505865016611e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1582 }, { "completion_length": 523.25, "epoch": 0.4387472283813747, "grad_norm": 0.0, "kl": 0.05080302059650421, "learning_rate": 4.916393630330857e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1583 }, { "completion_length": 419.5, "epoch": 0.43902439024390244, "grad_norm": 0.0, "kl": 0.09596148133277893, "learning_rate": 4.916281321544362e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1584 }, { "completion_length": 416.5, "epoch": 0.4393015521064302, "grad_norm": 0.0, "kl": 0.07501459121704102, "learning_rate": 4.916168938660571e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1585 }, { "completion_length": 428.25, "epoch": 0.43957871396895787, "grad_norm": 0.0, "kl": 0.07951947301626205, "learning_rate": 4.916056481682929e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1586 }, { "completion_length": 470.75, "epoch": 0.4398558758314856, "grad_norm": 0.6926064491271973, "kl": 0.05839166417717934, "learning_rate": 4.915943950614887e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1587 }, { "completion_length": 493.0, "epoch": 0.4401330376940133, "grad_norm": 0.0, "kl": 0.06568673998117447, "learning_rate": 4.915831345459892e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1588 }, { "completion_length": 409.75, "epoch": 0.44041019955654104, "grad_norm": 0.0, "kl": 0.10538529604673386, "learning_rate": 4.9157186662214e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1589 }, { "completion_length": 502.5, "epoch": 0.4406873614190687, "grad_norm": 0.0, "kl": 0.054769497364759445, "learning_rate": 4.915605912902866e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1590 }, { "completion_length": 356.5, "epoch": 0.44096452328159647, "grad_norm": 0.4999082684516907, "kl": 0.05984360724687576, "learning_rate": 4.915493085507747e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1591 }, { "completion_length": 471.75, "epoch": 0.44124168514412415, "grad_norm": 0.4109421670436859, "kl": 0.07904750108718872, "learning_rate": 4.915380184039504e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1592 }, { "completion_length": 459.75, "epoch": 0.4415188470066519, "grad_norm": 0.0, "kl": 0.06902697682380676, "learning_rate": 4.9152672085015975e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1593 }, { "completion_length": 447.75, "epoch": 0.4417960088691796, "grad_norm": 0.40944164991378784, "kl": 0.06602191925048828, "learning_rate": 4.915154158897492e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1594 }, { "completion_length": 558.5, "epoch": 0.4420731707317073, "grad_norm": 0.3126741051673889, "kl": 0.059627972543239594, "learning_rate": 4.9150410352306575e-06, "loss": 0.0, "reward": 4.5, "reward_std": 1.8929693698883057, "rewards/confident_score_func": 1.375, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1595 }, { "completion_length": 519.5, "epoch": 0.442350332594235, "grad_norm": 0.3231665790081024, "kl": 0.05933857709169388, "learning_rate": 4.914927837504559e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1596 }, { "completion_length": 465.0, "epoch": 0.44262749445676275, "grad_norm": 0.3869689106941223, "kl": 0.05747104436159134, "learning_rate": 4.914814565722671e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1597 }, { "completion_length": 428.25, "epoch": 0.4429046563192905, "grad_norm": 0.0, "kl": 0.05756913498044014, "learning_rate": 4.9147012198884656e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1598 }, { "completion_length": 428.25, "epoch": 0.4431818181818182, "grad_norm": 0.4156346917152405, "kl": 0.08368752151727676, "learning_rate": 4.914587800005418e-06, "loss": -0.0, "reward": 3.59375, "reward_std": 2.5028629302978516, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 1599 }, { "completion_length": 412.75, "epoch": 0.4434589800443459, "grad_norm": 0.0, "kl": 0.060815874487161636, "learning_rate": 4.914474306077007e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1600 }, { "completion_length": 484.25, "epoch": 0.4437361419068736, "grad_norm": 0.38930460810661316, "kl": 0.06516031175851822, "learning_rate": 4.914360738106713e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1601 }, { "completion_length": 416.75, "epoch": 0.44401330376940135, "grad_norm": 0.48254042863845825, "kl": 0.08361515402793884, "learning_rate": 4.914247096098019e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1602 }, { "completion_length": 556.5, "epoch": 0.44429046563192903, "grad_norm": 0.32330912351608276, "kl": 0.058469608426094055, "learning_rate": 4.914133380054409e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1603 }, { "completion_length": 452.75, "epoch": 0.4445676274944568, "grad_norm": 0.0, "kl": 0.06690546125173569, "learning_rate": 4.9140195899793705e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1604 }, { "completion_length": 412.5, "epoch": 0.44484478935698446, "grad_norm": 0.4345249533653259, "kl": 0.05807683989405632, "learning_rate": 4.9139057258763945e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1605 }, { "completion_length": 487.75, "epoch": 0.4451219512195122, "grad_norm": 0.4020804464817047, "kl": 0.05784706026315689, "learning_rate": 4.913791787748971e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1606 }, { "completion_length": 510.0, "epoch": 0.4453991130820399, "grad_norm": 0.38004061579704285, "kl": 0.07139789313077927, "learning_rate": 4.913677775600594e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1607 }, { "completion_length": 396.5, "epoch": 0.44567627494456763, "grad_norm": 0.0, "kl": 0.07200035452842712, "learning_rate": 4.913563689434761e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1608 }, { "completion_length": 489.25, "epoch": 0.44595343680709537, "grad_norm": 0.4245154857635498, "kl": 0.0698883906006813, "learning_rate": 4.9134495292549694e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1609 }, { "completion_length": 407.0, "epoch": 0.44623059866962306, "grad_norm": 0.4195176661014557, "kl": 0.08760451525449753, "learning_rate": 4.913335295064721e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1610 }, { "completion_length": 469.25, "epoch": 0.4465077605321508, "grad_norm": 0.3675207495689392, "kl": 0.06734029203653336, "learning_rate": 4.913220986867517e-06, "loss": -0.0, "reward": 5.5, "reward_std": 0.5, "rewards/confident_score_func": 1.75, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1611 }, { "completion_length": 469.25, "epoch": 0.4467849223946785, "grad_norm": 0.4516620934009552, "kl": 0.06647118926048279, "learning_rate": 4.913106604666865e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1612 }, { "completion_length": 451.75, "epoch": 0.4470620842572062, "grad_norm": 0.0, "kl": 0.057424090802669525, "learning_rate": 4.9129921484662714e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1613 }, { "completion_length": 446.0, "epoch": 0.4473392461197339, "grad_norm": 0.3681809604167938, "kl": 0.055982112884521484, "learning_rate": 4.912877618269247e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1614 }, { "completion_length": 433.25, "epoch": 0.44761640798226165, "grad_norm": 0.4165467619895935, "kl": 0.10248462110757828, "learning_rate": 4.912763014079303e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1615 }, { "completion_length": 410.25, "epoch": 0.44789356984478934, "grad_norm": 0.4211660325527191, "kl": 0.07109431177377701, "learning_rate": 4.912648335899955e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1616 }, { "completion_length": 467.75, "epoch": 0.4481707317073171, "grad_norm": 0.43013665080070496, "kl": 0.07203447073698044, "learning_rate": 4.912533583734718e-06, "loss": 0.0, "reward": 5.5, "reward_std": 0.5, "rewards/confident_score_func": 1.75, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1617 }, { "completion_length": 398.0, "epoch": 0.44844789356984477, "grad_norm": 0.5203729867935181, "kl": 0.08326759189367294, "learning_rate": 4.9124187575871115e-06, "loss": -0.0, "reward": 2.4375, "reward_std": 2.285598039627075, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.6875, "step": 1618 }, { "completion_length": 520.25, "epoch": 0.4487250554323725, "grad_norm": 0.3566998243331909, "kl": 0.07379839569330215, "learning_rate": 4.9123038574606575e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1619 }, { "completion_length": 509.25, "epoch": 0.4490022172949002, "grad_norm": 0.34245434403419495, "kl": 0.06675250083208084, "learning_rate": 4.912188883358879e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1620 }, { "completion_length": 506.0, "epoch": 0.44927937915742794, "grad_norm": 0.39038002490997314, "kl": 0.06085076183080673, "learning_rate": 4.912073835285303e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1621 }, { "completion_length": 541.25, "epoch": 0.4495565410199557, "grad_norm": 0.3506198823451996, "kl": 0.0713835135102272, "learning_rate": 4.911958713243455e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1622 }, { "completion_length": 449.25, "epoch": 0.44983370288248337, "grad_norm": 0.41286689043045044, "kl": 0.07371244579553604, "learning_rate": 4.911843517236867e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1623 }, { "completion_length": 500.0, "epoch": 0.4501108647450111, "grad_norm": 0.0, "kl": 0.06566787511110306, "learning_rate": 4.911728247269072e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1624 }, { "completion_length": 518.75, "epoch": 0.4503880266075388, "grad_norm": 0.3412729799747467, "kl": 0.06134812906384468, "learning_rate": 4.911612903343604e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1625 }, { "completion_length": 456.25, "epoch": 0.45066518847006654, "grad_norm": 0.4574596881866455, "kl": 0.07907947897911072, "learning_rate": 4.911497485464e-06, "loss": -0.0, "reward": 4.0, "reward_std": 2.0615527629852295, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1626 }, { "completion_length": 483.0, "epoch": 0.4509423503325942, "grad_norm": 0.4052596092224121, "kl": 0.07002019882202148, "learning_rate": 4.9113819936337995e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1627 }, { "completion_length": 412.5, "epoch": 0.45121951219512196, "grad_norm": 0.4218981862068176, "kl": 0.0861450657248497, "learning_rate": 4.9112664278565445e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1628 }, { "completion_length": 496.25, "epoch": 0.45149667405764965, "grad_norm": 0.0, "kl": 0.08902566134929657, "learning_rate": 4.91115078813578e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1629 }, { "completion_length": 384.25, "epoch": 0.4517738359201774, "grad_norm": 0.0, "kl": 0.2194235622882843, "learning_rate": 4.911035074475049e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1630 }, { "completion_length": 424.0, "epoch": 0.4520509977827051, "grad_norm": 0.47285497188568115, "kl": 0.07446150481700897, "learning_rate": 4.910919286877903e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1631 }, { "completion_length": 403.5, "epoch": 0.4523281596452328, "grad_norm": 0.0, "kl": 0.0998433530330658, "learning_rate": 4.910803425347892e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1632 }, { "completion_length": 503.0, "epoch": 0.4526053215077605, "grad_norm": 0.3606918454170227, "kl": 0.08296451717615128, "learning_rate": 4.910687489888568e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1633 }, { "completion_length": 459.75, "epoch": 0.45288248337028825, "grad_norm": 0.4067144989967346, "kl": 0.07700220495462418, "learning_rate": 4.910571480503487e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1634 }, { "completion_length": 417.5, "epoch": 0.453159645232816, "grad_norm": 0.39108172059059143, "kl": 0.0739208534359932, "learning_rate": 4.910455397196206e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1635 }, { "completion_length": 469.5, "epoch": 0.4534368070953437, "grad_norm": 0.40280815958976746, "kl": 0.09339854121208191, "learning_rate": 4.910339239970286e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1636 }, { "completion_length": 571.75, "epoch": 0.4537139689578714, "grad_norm": 0.2579387426376343, "kl": 0.058460116386413574, "learning_rate": 4.910223008829288e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1637 }, { "completion_length": 455.75, "epoch": 0.4539911308203991, "grad_norm": 0.5015838146209717, "kl": 0.1045583039522171, "learning_rate": 4.910106703776777e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1638 }, { "completion_length": 440.75, "epoch": 0.45426829268292684, "grad_norm": 0.4221779406070709, "kl": 0.08165937662124634, "learning_rate": 4.909990324816319e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1639 }, { "completion_length": 452.0, "epoch": 0.45454545454545453, "grad_norm": 0.3967173099517822, "kl": 0.06424779444932938, "learning_rate": 4.909873871951483e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1640 }, { "completion_length": 463.5, "epoch": 0.45482261640798227, "grad_norm": 0.0, "kl": 0.07741525024175644, "learning_rate": 4.90975734518584e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1641 }, { "completion_length": 531.75, "epoch": 0.45509977827050996, "grad_norm": 0.3718971312046051, "kl": 0.061826545745134354, "learning_rate": 4.909640744522963e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1642 }, { "completion_length": 453.75, "epoch": 0.4553769401330377, "grad_norm": 0.0, "kl": 0.08572742342948914, "learning_rate": 4.909524069966429e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1643 }, { "completion_length": 469.25, "epoch": 0.4556541019955654, "grad_norm": 0.362407386302948, "kl": 0.07865921407938004, "learning_rate": 4.909407321519815e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1644 }, { "completion_length": 410.75, "epoch": 0.4559312638580931, "grad_norm": 0.38003674149513245, "kl": 0.07934208959341049, "learning_rate": 4.909290499186701e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1645 }, { "completion_length": 468.75, "epoch": 0.45620842572062087, "grad_norm": 0.0, "kl": 0.08174817264080048, "learning_rate": 4.90917360297067e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1646 }, { "completion_length": 497.75, "epoch": 0.45648558758314856, "grad_norm": 0.39631059765815735, "kl": 0.07118058204650879, "learning_rate": 4.909056632875307e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1647 }, { "completion_length": 485.0, "epoch": 0.4567627494456763, "grad_norm": 0.3838047981262207, "kl": 0.07264430075883865, "learning_rate": 4.908939588904198e-06, "loss": -0.0, "reward": 3.5, "reward_std": 2.0615527629852295, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1648 }, { "completion_length": 411.5, "epoch": 0.457039911308204, "grad_norm": 0.5040830373764038, "kl": 0.08149221539497375, "learning_rate": 4.908822471060932e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1649 }, { "completion_length": 496.25, "epoch": 0.4573170731707317, "grad_norm": 0.0, "kl": 0.08406127989292145, "learning_rate": 4.908705279349103e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1650 }, { "completion_length": 432.25, "epoch": 0.4575942350332594, "grad_norm": 0.3891690671443939, "kl": 0.08459283411502838, "learning_rate": 4.908588013772302e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1651 }, { "completion_length": 447.0, "epoch": 0.45787139689578715, "grad_norm": 0.0, "kl": 0.07521750777959824, "learning_rate": 4.908470674334126e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1652 }, { "completion_length": 442.0, "epoch": 0.45814855875831484, "grad_norm": 0.4581425189971924, "kl": 0.09237226098775864, "learning_rate": 4.908353261038174e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1653 }, { "completion_length": 414.25, "epoch": 0.4584257206208426, "grad_norm": 0.0, "kl": 0.12645290791988373, "learning_rate": 4.908235773888046e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1654 }, { "completion_length": 512.0, "epoch": 0.45870288248337027, "grad_norm": 0.36394187808036804, "kl": 0.06911344826221466, "learning_rate": 4.908118212887344e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1655 }, { "completion_length": 383.5, "epoch": 0.458980044345898, "grad_norm": 0.5766294598579407, "kl": 0.08284039795398712, "learning_rate": 4.908000578039675e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1656 }, { "completion_length": 460.25, "epoch": 0.4592572062084257, "grad_norm": 0.0, "kl": 0.07162851095199585, "learning_rate": 4.907882869348644e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1657 }, { "completion_length": 458.25, "epoch": 0.45953436807095344, "grad_norm": 0.501768171787262, "kl": 0.08510696142911911, "learning_rate": 4.9077650868178624e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1658 }, { "completion_length": 485.75, "epoch": 0.4598115299334812, "grad_norm": 0.43226128816604614, "kl": 0.07771185040473938, "learning_rate": 4.907647230450942e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1659 }, { "completion_length": 433.5, "epoch": 0.46008869179600886, "grad_norm": 0.0, "kl": 0.08764266222715378, "learning_rate": 4.907529300251497e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1660 }, { "completion_length": 388.75, "epoch": 0.4603658536585366, "grad_norm": 0.0, "kl": 0.07360488921403885, "learning_rate": 4.9074112962231425e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1661 }, { "completion_length": 430.75, "epoch": 0.4606430155210643, "grad_norm": 0.0, "kl": 0.0853155106306076, "learning_rate": 4.907293218369499e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1662 }, { "completion_length": 433.0, "epoch": 0.46092017738359203, "grad_norm": 0.5562745332717896, "kl": 0.1093992367386818, "learning_rate": 4.907175066694186e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1663 }, { "completion_length": 428.0, "epoch": 0.4611973392461197, "grad_norm": 0.4059908986091614, "kl": 0.07612624019384384, "learning_rate": 4.907056841200827e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1664 }, { "completion_length": 438.5, "epoch": 0.46147450110864746, "grad_norm": 0.43531009554862976, "kl": 0.10873571783304214, "learning_rate": 4.906938541893048e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1665 }, { "completion_length": 464.0, "epoch": 0.46175166297117515, "grad_norm": 0.4104492962360382, "kl": 0.0702476054430008, "learning_rate": 4.9068201687744774e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1666 }, { "completion_length": 478.25, "epoch": 0.4620288248337029, "grad_norm": 0.38969770073890686, "kl": 0.09035757929086685, "learning_rate": 4.906701721848744e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1667 }, { "completion_length": 453.25, "epoch": 0.4623059866962306, "grad_norm": 0.40516576170921326, "kl": 0.08256322145462036, "learning_rate": 4.906583201119479e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1668 }, { "completion_length": 474.0, "epoch": 0.4625831485587583, "grad_norm": 0.36707961559295654, "kl": 0.07906953990459442, "learning_rate": 4.90646460659032e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1669 }, { "completion_length": 426.0, "epoch": 0.46286031042128606, "grad_norm": 0.0, "kl": 0.06553971767425537, "learning_rate": 4.9063459382649014e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1670 }, { "completion_length": 430.75, "epoch": 0.46313747228381374, "grad_norm": 0.0, "kl": 0.07305721938610077, "learning_rate": 4.906227196146863e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1671 }, { "completion_length": 481.25, "epoch": 0.4634146341463415, "grad_norm": 0.0, "kl": 0.07088493555784225, "learning_rate": 4.9061083802398465e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1672 }, { "completion_length": 528.5, "epoch": 0.4636917960088692, "grad_norm": 0.3764597773551941, "kl": 0.07194501906633377, "learning_rate": 4.905989490547496e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1673 }, { "completion_length": 432.75, "epoch": 0.4639689578713969, "grad_norm": 0.4315732717514038, "kl": 0.07203172892332077, "learning_rate": 4.905870527073455e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1674 }, { "completion_length": 402.5, "epoch": 0.4642461197339246, "grad_norm": 0.4271937608718872, "kl": 0.07309919595718384, "learning_rate": 4.905751489821374e-06, "loss": -0.0, "reward": 5.5, "reward_std": 0.5, "rewards/confident_score_func": 1.75, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1675 }, { "completion_length": 437.25, "epoch": 0.46452328159645234, "grad_norm": 0.43916061520576477, "kl": 0.11276346445083618, "learning_rate": 4.905632378794902e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1676 }, { "completion_length": 414.0, "epoch": 0.46480044345898003, "grad_norm": 0.5038041472434998, "kl": 0.08289551734924316, "learning_rate": 4.905513193997692e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1677 }, { "completion_length": 445.25, "epoch": 0.46507760532150777, "grad_norm": 0.3815097510814667, "kl": 0.08632057905197144, "learning_rate": 4.905393935433399e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1678 }, { "completion_length": 451.25, "epoch": 0.46535476718403546, "grad_norm": 0.0, "kl": 0.08134541660547256, "learning_rate": 4.905274603105681e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1679 }, { "completion_length": 486.0, "epoch": 0.4656319290465632, "grad_norm": 0.0, "kl": 0.08366655558347702, "learning_rate": 4.905155197018196e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1680 }, { "completion_length": 445.25, "epoch": 0.4659090909090909, "grad_norm": 0.41476550698280334, "kl": 0.06030012667179108, "learning_rate": 4.905035717174607e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1681 }, { "completion_length": 468.25, "epoch": 0.4661862527716186, "grad_norm": 0.39570745825767517, "kl": 0.07517978549003601, "learning_rate": 4.904916163578576e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1682 }, { "completion_length": 413.75, "epoch": 0.46646341463414637, "grad_norm": 0.43591102957725525, "kl": 0.09580521285533905, "learning_rate": 4.904796536233771e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1683 }, { "completion_length": 506.25, "epoch": 0.46674057649667405, "grad_norm": 0.4012725353240967, "kl": 0.05748039484024048, "learning_rate": 4.90467683514386e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1684 }, { "completion_length": 447.0, "epoch": 0.4670177383592018, "grad_norm": 0.3819842040538788, "kl": 0.22686715424060822, "learning_rate": 4.904557060312514e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1685 }, { "completion_length": 472.75, "epoch": 0.4672949002217295, "grad_norm": 0.3974871337413788, "kl": 0.0727102980017662, "learning_rate": 4.904437211743406e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1686 }, { "completion_length": 452.5, "epoch": 0.4675720620842572, "grad_norm": 0.42804667353630066, "kl": 0.07803210616111755, "learning_rate": 4.90431728944021e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1687 }, { "completion_length": 498.5, "epoch": 0.4678492239467849, "grad_norm": 0.38535404205322266, "kl": 0.08760089427232742, "learning_rate": 4.904197293406604e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1688 }, { "completion_length": 438.75, "epoch": 0.46812638580931265, "grad_norm": 0.7454708814620972, "kl": 0.07269416749477386, "learning_rate": 4.9040772236462695e-06, "loss": 0.0, "reward": 2.5, "reward_std": 2.217355728149414, "rewards/confident_score_func": 0.25, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1689 }, { "completion_length": 527.75, "epoch": 0.46840354767184034, "grad_norm": 0.0, "kl": 0.08158546686172485, "learning_rate": 4.903957080162886e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1690 }, { "completion_length": 489.0, "epoch": 0.4686807095343681, "grad_norm": 0.3246288001537323, "kl": 0.08382849395275116, "learning_rate": 4.903836862960141e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1691 }, { "completion_length": 492.75, "epoch": 0.46895787139689576, "grad_norm": 0.6480025053024292, "kl": 0.08913504332304001, "learning_rate": 4.903716572041718e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1692 }, { "completion_length": 429.75, "epoch": 0.4692350332594235, "grad_norm": 0.3955572843551636, "kl": 0.0860522910952568, "learning_rate": 4.903596207411307e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1693 }, { "completion_length": 489.5, "epoch": 0.4695121951219512, "grad_norm": 0.5094815492630005, "kl": 0.0872674360871315, "learning_rate": 4.903475769072599e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1694 }, { "completion_length": 444.0, "epoch": 0.46978935698447893, "grad_norm": 0.4049544632434845, "kl": 0.0865527018904686, "learning_rate": 4.903355257029286e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1695 }, { "completion_length": 461.0, "epoch": 0.4700665188470067, "grad_norm": 0.0, "kl": 0.08138766884803772, "learning_rate": 4.903234671285067e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1696 }, { "completion_length": 466.0, "epoch": 0.47034368070953436, "grad_norm": 0.4273386597633362, "kl": 0.08034420013427734, "learning_rate": 4.903114011843637e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1697 }, { "completion_length": 479.0, "epoch": 0.4706208425720621, "grad_norm": 0.419124037027359, "kl": 0.0806594043970108, "learning_rate": 4.902993278708698e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1698 }, { "completion_length": 461.25, "epoch": 0.4708980044345898, "grad_norm": 0.0, "kl": 0.09038417041301727, "learning_rate": 4.902872471883949e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1699 }, { "completion_length": 457.0, "epoch": 0.47117516629711753, "grad_norm": 0.41774123907089233, "kl": 0.08455045521259308, "learning_rate": 4.902751591373099e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1700 }, { "completion_length": 378.0, "epoch": 0.4714523281596452, "grad_norm": 0.49712857604026794, "kl": 0.07561414688825607, "learning_rate": 4.9026306371798526e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1701 }, { "completion_length": 484.0, "epoch": 0.47172949002217296, "grad_norm": 0.3918231427669525, "kl": 0.08764111995697021, "learning_rate": 4.902509609307918e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1702 }, { "completion_length": 433.25, "epoch": 0.47200665188470065, "grad_norm": 0.4394415616989136, "kl": 0.10253691673278809, "learning_rate": 4.90238850776101e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1703 }, { "completion_length": 493.75, "epoch": 0.4722838137472284, "grad_norm": 0.0, "kl": 0.07204173505306244, "learning_rate": 4.902267332542838e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1704 }, { "completion_length": 459.75, "epoch": 0.4725609756097561, "grad_norm": 0.3766133785247803, "kl": 0.0881323292851448, "learning_rate": 4.902146083657122e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1705 }, { "completion_length": 455.0, "epoch": 0.4728381374722838, "grad_norm": 0.4622858762741089, "kl": 0.08853214979171753, "learning_rate": 4.902024761107577e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1706 }, { "completion_length": 508.25, "epoch": 0.47311529933481156, "grad_norm": 0.33371710777282715, "kl": 0.09077693521976471, "learning_rate": 4.901903364897926e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1707 }, { "completion_length": 380.25, "epoch": 0.47339246119733924, "grad_norm": 0.46875062584877014, "kl": 0.09229562431573868, "learning_rate": 4.9017818950318886e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1708 }, { "completion_length": 527.5, "epoch": 0.473669623059867, "grad_norm": 0.39241814613342285, "kl": 0.06793490797281265, "learning_rate": 4.9016603515131926e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1709 }, { "completion_length": 438.75, "epoch": 0.47394678492239467, "grad_norm": 0.0, "kl": 0.10124422609806061, "learning_rate": 4.9015387343455646e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1710 }, { "completion_length": 399.0, "epoch": 0.4742239467849224, "grad_norm": 0.4330104887485504, "kl": 0.08719740808010101, "learning_rate": 4.901417043532733e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1711 }, { "completion_length": 430.0, "epoch": 0.4745011086474501, "grad_norm": 0.5180028080940247, "kl": 0.08414454758167267, "learning_rate": 4.901295279078431e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1712 }, { "completion_length": 476.25, "epoch": 0.47477827050997784, "grad_norm": 0.0, "kl": 0.08312702924013138, "learning_rate": 4.901173440986392e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1713 }, { "completion_length": 472.75, "epoch": 0.4750554323725055, "grad_norm": 0.0, "kl": 0.10554109513759613, "learning_rate": 4.901051529260352e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1714 }, { "completion_length": 428.0, "epoch": 0.47533259423503327, "grad_norm": 0.4301515519618988, "kl": 0.10382313281297684, "learning_rate": 4.900929543904049e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1715 }, { "completion_length": 448.75, "epoch": 0.47560975609756095, "grad_norm": 0.3286517858505249, "kl": 0.09243439137935638, "learning_rate": 4.900807484921226e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1716 }, { "completion_length": 420.25, "epoch": 0.4758869179600887, "grad_norm": 0.49274149537086487, "kl": 0.11597573757171631, "learning_rate": 4.900685352315624e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1717 }, { "completion_length": 445.5, "epoch": 0.4761640798226164, "grad_norm": 0.0, "kl": 0.1216328963637352, "learning_rate": 4.900563146090988e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1718 }, { "completion_length": 446.0, "epoch": 0.4764412416851441, "grad_norm": 0.0, "kl": 0.10293858498334885, "learning_rate": 4.900440866251067e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1719 }, { "completion_length": 413.75, "epoch": 0.47671840354767187, "grad_norm": 0.44730517268180847, "kl": 0.09889344125986099, "learning_rate": 4.90031851279961e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1720 }, { "completion_length": 445.5, "epoch": 0.47699556541019955, "grad_norm": 0.4307750463485718, "kl": 0.10904858261346817, "learning_rate": 4.90019608574037e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1721 }, { "completion_length": 454.25, "epoch": 0.4772727272727273, "grad_norm": 0.0, "kl": 0.08516764640808105, "learning_rate": 4.9000735850771e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1722 }, { "completion_length": 394.5, "epoch": 0.477549889135255, "grad_norm": 0.0, "kl": 0.089196115732193, "learning_rate": 4.899951010813557e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1723 }, { "completion_length": 404.75, "epoch": 0.4778270509977827, "grad_norm": 0.0, "kl": 0.09046857059001923, "learning_rate": 4.8998283629535e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1724 }, { "completion_length": 369.25, "epoch": 0.4781042128603104, "grad_norm": 0.467543363571167, "kl": 0.09164103865623474, "learning_rate": 4.899705641500691e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1725 }, { "completion_length": 473.5, "epoch": 0.47838137472283815, "grad_norm": 0.0, "kl": 0.08782867342233658, "learning_rate": 4.899582846458891e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1726 }, { "completion_length": 409.0, "epoch": 0.47865853658536583, "grad_norm": 0.48050612211227417, "kl": 0.08806860446929932, "learning_rate": 4.899459977831869e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1727 }, { "completion_length": 451.0, "epoch": 0.4789356984478936, "grad_norm": 0.3761707842350006, "kl": 0.08394092321395874, "learning_rate": 4.8993370356233904e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1728 }, { "completion_length": 411.0, "epoch": 0.47921286031042126, "grad_norm": 0.46679872274398804, "kl": 0.07869067788124084, "learning_rate": 4.899214019837225e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1729 }, { "completion_length": 381.0, "epoch": 0.479490022172949, "grad_norm": 0.0, "kl": 0.11614537984132767, "learning_rate": 4.899090930477147e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1730 }, { "completion_length": 528.0, "epoch": 0.47976718403547675, "grad_norm": 0.362496554851532, "kl": 0.06729546189308167, "learning_rate": 4.89896776754693e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1731 }, { "completion_length": 432.5, "epoch": 0.48004434589800443, "grad_norm": 0.42638325691223145, "kl": 0.09914625436067581, "learning_rate": 4.8988445310503505e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1732 }, { "completion_length": 391.75, "epoch": 0.4803215077605322, "grad_norm": 0.4811759293079376, "kl": 0.09321407228708267, "learning_rate": 4.898721220991189e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1733 }, { "completion_length": 432.75, "epoch": 0.48059866962305986, "grad_norm": 0.44459155201911926, "kl": 0.07957419753074646, "learning_rate": 4.898597837373226e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1734 }, { "completion_length": 419.25, "epoch": 0.4808758314855876, "grad_norm": 0.563022792339325, "kl": 0.08996603637933731, "learning_rate": 4.898474380200244e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1735 }, { "completion_length": 396.25, "epoch": 0.4811529933481153, "grad_norm": 0.0, "kl": 0.1071963682770729, "learning_rate": 4.898350849476031e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1736 }, { "completion_length": 355.5, "epoch": 0.48143015521064303, "grad_norm": 0.5923029184341431, "kl": 0.11379054188728333, "learning_rate": 4.898227245204374e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1737 }, { "completion_length": 391.25, "epoch": 0.4817073170731707, "grad_norm": 0.4473486840724945, "kl": 0.08859968185424805, "learning_rate": 4.898103567389065e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1738 }, { "completion_length": 417.25, "epoch": 0.48198447893569846, "grad_norm": 0.39877548813819885, "kl": 0.08650372177362442, "learning_rate": 4.897979816033894e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1739 }, { "completion_length": 389.0, "epoch": 0.48226164079822614, "grad_norm": 0.5270780324935913, "kl": 0.14115482568740845, "learning_rate": 4.897855991142658e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1740 }, { "completion_length": 418.25, "epoch": 0.4825388026607539, "grad_norm": 0.5221999287605286, "kl": 0.16417314112186432, "learning_rate": 4.897732092719154e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1741 }, { "completion_length": 482.25, "epoch": 0.48281596452328157, "grad_norm": 0.0, "kl": 0.08089955151081085, "learning_rate": 4.897608120767181e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1742 }, { "completion_length": 482.0, "epoch": 0.4830931263858093, "grad_norm": 0.39135536551475525, "kl": 0.07185870409011841, "learning_rate": 4.89748407529054e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1743 }, { "completion_length": 472.0, "epoch": 0.48337028824833705, "grad_norm": 0.0, "kl": 0.08149999380111694, "learning_rate": 4.897359956293036e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1744 }, { "completion_length": 395.75, "epoch": 0.48364745011086474, "grad_norm": 0.4455694258213043, "kl": 0.08281487226486206, "learning_rate": 4.8972357637784755e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1745 }, { "completion_length": 468.25, "epoch": 0.4839246119733925, "grad_norm": 0.4249611496925354, "kl": 0.09129584580659866, "learning_rate": 4.897111497750665e-06, "loss": -0.0, "reward": 3.0, "reward_std": 1.8929693698883057, "rewards/confident_score_func": 0.25, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1746 }, { "completion_length": 407.0, "epoch": 0.48420177383592017, "grad_norm": 0.4253355860710144, "kl": 0.07078725844621658, "learning_rate": 4.896987158213418e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1747 }, { "completion_length": 464.75, "epoch": 0.4844789356984479, "grad_norm": 0.42958545684814453, "kl": 0.10932318866252899, "learning_rate": 4.896862745170545e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1748 }, { "completion_length": 403.0, "epoch": 0.4847560975609756, "grad_norm": 0.46490517258644104, "kl": 0.11458267271518707, "learning_rate": 4.896738258625863e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1749 }, { "completion_length": 408.5, "epoch": 0.48503325942350334, "grad_norm": 0.0, "kl": 0.08226732164621353, "learning_rate": 4.896613698583188e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1750 }, { "completion_length": 428.25, "epoch": 0.485310421286031, "grad_norm": 0.4170202314853668, "kl": 0.07189112901687622, "learning_rate": 4.896489065046341e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1751 }, { "completion_length": 421.25, "epoch": 0.48558758314855877, "grad_norm": 0.4634906053543091, "kl": 0.07947812229394913, "learning_rate": 4.8963643580191446e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1752 }, { "completion_length": 491.0, "epoch": 0.48586474501108645, "grad_norm": 0.0, "kl": 0.07414484769105911, "learning_rate": 4.896239577505421e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1753 }, { "completion_length": 465.5, "epoch": 0.4861419068736142, "grad_norm": 0.4409260153770447, "kl": 0.08071828633546829, "learning_rate": 4.896114723508998e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1754 }, { "completion_length": 448.0, "epoch": 0.4864190687361419, "grad_norm": 0.3820900619029999, "kl": 0.07481700927019119, "learning_rate": 4.8959897960337035e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1755 }, { "completion_length": 535.75, "epoch": 0.4866962305986696, "grad_norm": 0.36553633213043213, "kl": 0.32027488946914673, "learning_rate": 4.8958647950833695e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1756 }, { "completion_length": 514.0, "epoch": 0.48697339246119736, "grad_norm": 0.3749096691608429, "kl": 0.06396996229887009, "learning_rate": 4.895739720661829e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1757 }, { "completion_length": 472.25, "epoch": 0.48725055432372505, "grad_norm": 0.408660352230072, "kl": 0.0707908496260643, "learning_rate": 4.895614572772916e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1758 }, { "completion_length": 338.75, "epoch": 0.4875277161862528, "grad_norm": 0.4710204005241394, "kl": 0.08632003515958786, "learning_rate": 4.8954893514204705e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1759 }, { "completion_length": 336.75, "epoch": 0.4878048780487805, "grad_norm": 0.6189284324645996, "kl": 0.07526957988739014, "learning_rate": 4.895364056608331e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1760 }, { "completion_length": 409.25, "epoch": 0.4880820399113082, "grad_norm": 0.4753383696079254, "kl": 0.14637108147144318, "learning_rate": 4.895238688340341e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1761 }, { "completion_length": 443.5, "epoch": 0.4883592017738359, "grad_norm": 0.5294762253761292, "kl": 0.08242465555667877, "learning_rate": 4.895113246620344e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1762 }, { "completion_length": 513.0, "epoch": 0.48863636363636365, "grad_norm": 0.0, "kl": 0.08698947727680206, "learning_rate": 4.894987731452188e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1763 }, { "completion_length": 460.0, "epoch": 0.48891352549889133, "grad_norm": 0.34248146414756775, "kl": 0.07512567192316055, "learning_rate": 4.89486214283972e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1764 }, { "completion_length": 431.5, "epoch": 0.4891906873614191, "grad_norm": 0.3923366963863373, "kl": 0.10650478303432465, "learning_rate": 4.894736480786793e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1765 }, { "completion_length": 423.25, "epoch": 0.48946784922394676, "grad_norm": 0.0, "kl": 0.07111692428588867, "learning_rate": 4.89461074529726e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1766 }, { "completion_length": 444.75, "epoch": 0.4897450110864745, "grad_norm": 0.39580053091049194, "kl": 0.08897905051708221, "learning_rate": 4.894484936374977e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1767 }, { "completion_length": 446.75, "epoch": 0.49002217294900224, "grad_norm": 0.4018922746181488, "kl": 0.08205412328243256, "learning_rate": 4.894359054023802e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1768 }, { "completion_length": 473.75, "epoch": 0.49029933481152993, "grad_norm": 0.38307949900627136, "kl": 0.07663699984550476, "learning_rate": 4.894233098247594e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1769 }, { "completion_length": 438.5, "epoch": 0.49057649667405767, "grad_norm": 0.0, "kl": 0.07502584159374237, "learning_rate": 4.894107069050218e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1770 }, { "completion_length": 410.75, "epoch": 0.49085365853658536, "grad_norm": 0.0, "kl": 0.08151350170373917, "learning_rate": 4.8939809664355366e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1771 }, { "completion_length": 400.25, "epoch": 0.4911308203991131, "grad_norm": 0.0, "kl": 0.07090267539024353, "learning_rate": 4.893854790407417e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1772 }, { "completion_length": 437.0, "epoch": 0.4914079822616408, "grad_norm": 0.40343746542930603, "kl": 0.06625537574291229, "learning_rate": 4.89372854096973e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1773 }, { "completion_length": 399.25, "epoch": 0.4916851441241685, "grad_norm": 0.40586742758750916, "kl": 0.22321371734142303, "learning_rate": 4.893602218126346e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1774 }, { "completion_length": 459.0, "epoch": 0.4919623059866962, "grad_norm": 0.4204888343811035, "kl": 0.07863927632570267, "learning_rate": 4.89347582188114e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1775 }, { "completion_length": 416.5, "epoch": 0.49223946784922396, "grad_norm": 0.7331802248954773, "kl": 0.07064196467399597, "learning_rate": 4.893349352237986e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1776 }, { "completion_length": 343.25, "epoch": 0.49251662971175164, "grad_norm": 0.47022753953933716, "kl": 0.11522483080625534, "learning_rate": 4.893222809200765e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1777 }, { "completion_length": 406.75, "epoch": 0.4927937915742794, "grad_norm": 0.4521123468875885, "kl": 0.07704486697912216, "learning_rate": 4.893096192773354e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1778 }, { "completion_length": 414.0, "epoch": 0.49307095343680707, "grad_norm": 0.0, "kl": 0.07653795182704926, "learning_rate": 4.892969502959639e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1779 }, { "completion_length": 472.75, "epoch": 0.4933481152993348, "grad_norm": 0.4532244801521301, "kl": 0.07857924699783325, "learning_rate": 4.892842739763504e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1780 }, { "completion_length": 393.25, "epoch": 0.49362527716186255, "grad_norm": 0.4297858476638794, "kl": 0.07113970071077347, "learning_rate": 4.892715903188836e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1781 }, { "completion_length": 470.25, "epoch": 0.49390243902439024, "grad_norm": 0.3629225194454193, "kl": 0.07957149296998978, "learning_rate": 4.8925889932395246e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1782 }, { "completion_length": 407.5, "epoch": 0.494179600886918, "grad_norm": 0.36326920986175537, "kl": 0.07178173959255219, "learning_rate": 4.892462009919461e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1783 }, { "completion_length": 440.5, "epoch": 0.49445676274944567, "grad_norm": 0.419808954000473, "kl": 0.08413983136415482, "learning_rate": 4.89233495323254e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1784 }, { "completion_length": 448.25, "epoch": 0.4947339246119734, "grad_norm": 0.4254230260848999, "kl": 0.09443847835063934, "learning_rate": 4.892207823182659e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1785 }, { "completion_length": 466.75, "epoch": 0.4950110864745011, "grad_norm": 0.39924222230911255, "kl": 0.06326138973236084, "learning_rate": 4.892080619773715e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1786 }, { "completion_length": 465.75, "epoch": 0.49528824833702884, "grad_norm": 0.4082907736301422, "kl": 0.07055123895406723, "learning_rate": 4.8919533430096085e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1787 }, { "completion_length": 436.75, "epoch": 0.4955654101995565, "grad_norm": 0.4112588167190552, "kl": 0.07909306138753891, "learning_rate": 4.891825992894244e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1788 }, { "completion_length": 414.25, "epoch": 0.49584257206208426, "grad_norm": 0.4311828017234802, "kl": 0.08997313678264618, "learning_rate": 4.8916985694315255e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1789 }, { "completion_length": 438.75, "epoch": 0.49611973392461195, "grad_norm": 0.0, "kl": 0.06553731113672256, "learning_rate": 4.891571072625362e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1790 }, { "completion_length": 406.75, "epoch": 0.4963968957871397, "grad_norm": 0.46444499492645264, "kl": 0.06760590523481369, "learning_rate": 4.891443502479661e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1791 }, { "completion_length": 410.75, "epoch": 0.49667405764966743, "grad_norm": 0.4163588285446167, "kl": 0.06881792843341827, "learning_rate": 4.891315858998337e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1792 }, { "completion_length": 429.5, "epoch": 0.4969512195121951, "grad_norm": 0.3874480128288269, "kl": 0.11495273560285568, "learning_rate": 4.891188142185304e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1793 }, { "completion_length": 439.75, "epoch": 0.49722838137472286, "grad_norm": 0.0, "kl": 0.06773148477077484, "learning_rate": 4.891060352044476e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1794 }, { "completion_length": 465.25, "epoch": 0.49750554323725055, "grad_norm": 0.0, "kl": 0.06765516847372055, "learning_rate": 4.890932488579775e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1795 }, { "completion_length": 420.25, "epoch": 0.4977827050997783, "grad_norm": 0.0, "kl": 0.08021818846464157, "learning_rate": 4.890804551795119e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1796 }, { "completion_length": 412.25, "epoch": 0.498059866962306, "grad_norm": 0.4910096824169159, "kl": 0.09604600816965103, "learning_rate": 4.890676541694433e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1797 }, { "completion_length": 340.75, "epoch": 0.4983370288248337, "grad_norm": 0.0, "kl": 0.07544931024312973, "learning_rate": 4.890548458281645e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1798 }, { "completion_length": 457.0, "epoch": 0.4986141906873614, "grad_norm": 0.3571500778198242, "kl": 0.07308022677898407, "learning_rate": 4.890420301560678e-06, "loss": 0.0, "reward": 2.5, "reward_std": 2.217355728149414, "rewards/confident_score_func": 0.25, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1799 }, { "completion_length": 432.75, "epoch": 0.49889135254988914, "grad_norm": 0.0, "kl": 0.06658948957920074, "learning_rate": 4.890292071535464e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1800 }, { "completion_length": 375.5, "epoch": 0.49916851441241683, "grad_norm": 0.4632571339607239, "kl": 0.10122938454151154, "learning_rate": 4.890163768209937e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1801 }, { "completion_length": 423.5, "epoch": 0.49944567627494457, "grad_norm": 0.4490313231945038, "kl": 0.07465866208076477, "learning_rate": 4.890035391588029e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1802 }, { "completion_length": 442.0, "epoch": 0.49972283813747226, "grad_norm": 0.0, "kl": 0.35233378410339355, "learning_rate": 4.8899069416736775e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1803 }, { "completion_length": 461.25, "epoch": 0.5, "grad_norm": 0.0, "kl": 0.06523491442203522, "learning_rate": 4.889778418470823e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1804 }, { "completion_length": 373.0, "epoch": 0.5002771618625277, "grad_norm": 0.3811831474304199, "kl": 0.10954444110393524, "learning_rate": 4.889649821983405e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1805 }, { "completion_length": 454.5, "epoch": 0.5005543237250555, "grad_norm": 0.3551279306411743, "kl": 0.05940881371498108, "learning_rate": 4.889521152215368e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1806 }, { "completion_length": 367.75, "epoch": 0.5008314855875832, "grad_norm": 0.0, "kl": 0.07391712814569473, "learning_rate": 4.889392409170657e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1807 }, { "completion_length": 437.0, "epoch": 0.5011086474501109, "grad_norm": 0.44770318269729614, "kl": 0.07630370557308197, "learning_rate": 4.8892635928532205e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1808 }, { "completion_length": 408.0, "epoch": 0.5013858093126385, "grad_norm": 0.0, "kl": 0.07417575269937515, "learning_rate": 4.889134703267009e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1809 }, { "completion_length": 351.25, "epoch": 0.5016629711751663, "grad_norm": 0.5454084873199463, "kl": 0.08086635917425156, "learning_rate": 4.889005740415975e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1810 }, { "completion_length": 452.0, "epoch": 0.501940133037694, "grad_norm": 0.42424651980400085, "kl": 0.11434409767389297, "learning_rate": 4.888876704304072e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1811 }, { "completion_length": 427.25, "epoch": 0.5022172949002217, "grad_norm": 0.3920648694038391, "kl": 0.07132618874311447, "learning_rate": 4.888747594935259e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1812 }, { "completion_length": 457.75, "epoch": 0.5024944567627494, "grad_norm": 0.4836595058441162, "kl": 0.12381377071142197, "learning_rate": 4.888618412313493e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1813 }, { "completion_length": 444.5, "epoch": 0.5027716186252772, "grad_norm": 0.3787281811237335, "kl": 0.05944032967090607, "learning_rate": 4.888489156442737e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1814 }, { "completion_length": 400.0, "epoch": 0.5030487804878049, "grad_norm": 0.3998754918575287, "kl": 0.08465702086687088, "learning_rate": 4.888359827326955e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1815 }, { "completion_length": 403.25, "epoch": 0.5033259423503326, "grad_norm": 0.50962233543396, "kl": 0.06972727924585342, "learning_rate": 4.888230424970113e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1816 }, { "completion_length": 414.25, "epoch": 0.5036031042128604, "grad_norm": 0.3621780276298523, "kl": 0.06841617822647095, "learning_rate": 4.888100949376178e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1817 }, { "completion_length": 435.0, "epoch": 0.503880266075388, "grad_norm": 0.5075739622116089, "kl": 0.07789444178342819, "learning_rate": 4.8879714005491205e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1818 }, { "completion_length": 435.5, "epoch": 0.5041574279379157, "grad_norm": 0.0, "kl": 0.08982973545789719, "learning_rate": 4.8878417784929145e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1819 }, { "completion_length": 417.0, "epoch": 0.5044345898004434, "grad_norm": 0.45573723316192627, "kl": 0.08469003438949585, "learning_rate": 4.887712083211534e-06, "loss": 0.0, "reward": 3.875, "reward_std": 2.1746647357940674, "rewards/confident_score_func": 1.125, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1820 }, { "completion_length": 433.0, "epoch": 0.5047117516629712, "grad_norm": 0.39635780453681946, "kl": 0.07387454807758331, "learning_rate": 4.887582314708958e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1821 }, { "completion_length": 436.75, "epoch": 0.5049889135254989, "grad_norm": 0.41004085540771484, "kl": 0.08934321999549866, "learning_rate": 4.887452472989162e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1822 }, { "completion_length": 436.0, "epoch": 0.5052660753880266, "grad_norm": 0.3965754508972168, "kl": 0.08382335305213928, "learning_rate": 4.887322558056132e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1823 }, { "completion_length": 403.5, "epoch": 0.5055432372505543, "grad_norm": 0.399112731218338, "kl": 0.07583162933588028, "learning_rate": 4.8871925699138495e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1824 }, { "completion_length": 455.0, "epoch": 0.5058203991130821, "grad_norm": 0.45115289092063904, "kl": 0.0833466649055481, "learning_rate": 4.887062508566302e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1825 }, { "completion_length": 399.5, "epoch": 0.5060975609756098, "grad_norm": 0.0, "kl": 0.07495877891778946, "learning_rate": 4.8869323740174765e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1826 }, { "completion_length": 428.75, "epoch": 0.5063747228381374, "grad_norm": 0.0, "kl": 0.06731050461530685, "learning_rate": 4.886802166271365e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1827 }, { "completion_length": 371.75, "epoch": 0.5066518847006651, "grad_norm": 0.0, "kl": 0.08602682501077652, "learning_rate": 4.8866718853319595e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1828 }, { "completion_length": 634.75, "epoch": 0.5069290465631929, "grad_norm": 0.37800854444503784, "kl": 0.08885493129491806, "learning_rate": 4.8865415312032554e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1829 }, { "completion_length": 430.0, "epoch": 0.5072062084257206, "grad_norm": 0.4543614089488983, "kl": 0.06791539490222931, "learning_rate": 4.88641110388925e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1830 }, { "completion_length": 461.75, "epoch": 0.5074833702882483, "grad_norm": 0.4326775372028351, "kl": 0.0699978694319725, "learning_rate": 4.886280603393945e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1831 }, { "completion_length": 496.5, "epoch": 0.5077605321507761, "grad_norm": 0.37193307280540466, "kl": 0.05535607412457466, "learning_rate": 4.886150029721339e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1832 }, { "completion_length": 404.25, "epoch": 0.5080376940133038, "grad_norm": 0.0, "kl": 0.07954035699367523, "learning_rate": 4.886019382875439e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1833 }, { "completion_length": 394.5, "epoch": 0.5083148558758315, "grad_norm": 0.43016859889030457, "kl": 0.06513276696205139, "learning_rate": 4.885888662860249e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1834 }, { "completion_length": 444.75, "epoch": 0.5085920177383592, "grad_norm": 0.515138566493988, "kl": 0.07061262428760529, "learning_rate": 4.885757869679779e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1835 }, { "completion_length": 392.5, "epoch": 0.508869179600887, "grad_norm": 0.0, "kl": 0.0871676430106163, "learning_rate": 4.885627003338039e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1836 }, { "completion_length": 394.5, "epoch": 0.5091463414634146, "grad_norm": 0.36578622460365295, "kl": 0.06697401404380798, "learning_rate": 4.8854960638390444e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1837 }, { "completion_length": 379.75, "epoch": 0.5094235033259423, "grad_norm": 0.34717291593551636, "kl": 0.09485981613397598, "learning_rate": 4.885365051186807e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1838 }, { "completion_length": 421.25, "epoch": 0.50970066518847, "grad_norm": 0.0, "kl": 0.08487318456172943, "learning_rate": 4.885233965385349e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1839 }, { "completion_length": 422.0, "epoch": 0.5099778270509978, "grad_norm": 0.4464085102081299, "kl": 0.09998074918985367, "learning_rate": 4.885102806438685e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1840 }, { "completion_length": 440.5, "epoch": 0.5102549889135255, "grad_norm": 0.45259764790534973, "kl": 0.08697021007537842, "learning_rate": 4.8849715743508405e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1841 }, { "completion_length": 436.25, "epoch": 0.5105321507760532, "grad_norm": 0.0, "kl": 0.07921171188354492, "learning_rate": 4.88484026912584e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1842 }, { "completion_length": 402.75, "epoch": 0.510809312638581, "grad_norm": 0.0, "kl": 0.09917541593313217, "learning_rate": 4.884708890767709e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1843 }, { "completion_length": 421.0, "epoch": 0.5110864745011087, "grad_norm": 0.0, "kl": 0.07622380554676056, "learning_rate": 4.884577439280475e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1844 }, { "completion_length": 382.5, "epoch": 0.5113636363636364, "grad_norm": 0.4307716190814972, "kl": 0.0855272188782692, "learning_rate": 4.8844459146681714e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1845 }, { "completion_length": 434.25, "epoch": 0.511640798226164, "grad_norm": 0.47950589656829834, "kl": 0.07491152733564377, "learning_rate": 4.88431431693483e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1846 }, { "completion_length": 344.25, "epoch": 0.5119179600886918, "grad_norm": 0.0, "kl": 0.07997728139162064, "learning_rate": 4.884182646084489e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1847 }, { "completion_length": 397.25, "epoch": 0.5121951219512195, "grad_norm": 0.39082977175712585, "kl": 0.07062537223100662, "learning_rate": 4.884050902121182e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1848 }, { "completion_length": 376.25, "epoch": 0.5124722838137472, "grad_norm": 0.0, "kl": 0.07340371608734131, "learning_rate": 4.883919085048953e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1849 }, { "completion_length": 389.0, "epoch": 0.5127494456762749, "grad_norm": 0.4232270419597626, "kl": 0.07718522101640701, "learning_rate": 4.883787194871841e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1850 }, { "completion_length": 426.75, "epoch": 0.5130266075388027, "grad_norm": 0.0, "kl": 0.07244522869586945, "learning_rate": 4.883655231593893e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1851 }, { "completion_length": 422.5, "epoch": 0.5133037694013304, "grad_norm": 0.0, "kl": 0.11390174180269241, "learning_rate": 4.883523195219154e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1852 }, { "completion_length": 380.5, "epoch": 0.5135809312638581, "grad_norm": 0.4513089060783386, "kl": 0.07733555883169174, "learning_rate": 4.883391085751674e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1853 }, { "completion_length": 343.25, "epoch": 0.5138580931263859, "grad_norm": 0.0, "kl": 0.06541575491428375, "learning_rate": 4.8832589031955045e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1854 }, { "completion_length": 335.0, "epoch": 0.5141352549889135, "grad_norm": 0.0, "kl": 0.08530468493700027, "learning_rate": 4.883126647554699e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1855 }, { "completion_length": 346.0, "epoch": 0.5144124168514412, "grad_norm": 0.5167350769042969, "kl": 0.07563317567110062, "learning_rate": 4.8829943188333115e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1856 }, { "completion_length": 362.75, "epoch": 0.5146895787139689, "grad_norm": 0.570594310760498, "kl": 0.09759710729122162, "learning_rate": 4.882861917035403e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1857 }, { "completion_length": 380.25, "epoch": 0.5149667405764967, "grad_norm": 0.4013983905315399, "kl": 0.07200059294700623, "learning_rate": 4.8827294421650305e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1858 }, { "completion_length": 437.25, "epoch": 0.5152439024390244, "grad_norm": 0.38878560066223145, "kl": 0.06663485616445541, "learning_rate": 4.882596894226258e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1859 }, { "completion_length": 344.25, "epoch": 0.5155210643015521, "grad_norm": 0.4839397072792053, "kl": 0.08400518447160721, "learning_rate": 4.8824642732231506e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1860 }, { "completion_length": 509.75, "epoch": 0.5157982261640798, "grad_norm": 0.3644596338272095, "kl": 0.12624937295913696, "learning_rate": 4.882331579159775e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1861 }, { "completion_length": 380.75, "epoch": 0.5160753880266076, "grad_norm": 0.0, "kl": 0.07659128308296204, "learning_rate": 4.882198812040199e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1862 }, { "completion_length": 365.0, "epoch": 0.5163525498891353, "grad_norm": 0.44713178277015686, "kl": 0.0897984728217125, "learning_rate": 4.882065971868496e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1863 }, { "completion_length": 392.25, "epoch": 0.516629711751663, "grad_norm": 0.3765386939048767, "kl": 0.07943381369113922, "learning_rate": 4.881933058648739e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1864 }, { "completion_length": 460.0, "epoch": 0.5169068736141907, "grad_norm": 0.0, "kl": 0.08193813264369965, "learning_rate": 4.8818000723850026e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1865 }, { "completion_length": 410.25, "epoch": 0.5171840354767184, "grad_norm": 0.4112558960914612, "kl": 0.07238113135099411, "learning_rate": 4.881667013081367e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1866 }, { "completion_length": 357.5, "epoch": 0.5174611973392461, "grad_norm": 0.0, "kl": 0.08554670959711075, "learning_rate": 4.881533880741911e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1867 }, { "completion_length": 447.0, "epoch": 0.5177383592017738, "grad_norm": 0.767349362373352, "kl": 0.06965171545743942, "learning_rate": 4.881400675370719e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1868 }, { "completion_length": 381.0, "epoch": 0.5180155210643016, "grad_norm": 0.43162333965301514, "kl": 0.08727413415908813, "learning_rate": 4.881267396971874e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1869 }, { "completion_length": 358.0, "epoch": 0.5182926829268293, "grad_norm": 0.0, "kl": 0.08518964797258377, "learning_rate": 4.881134045549463e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1870 }, { "completion_length": 345.0, "epoch": 0.518569844789357, "grad_norm": 0.4593121111392975, "kl": 0.09667008370161057, "learning_rate": 4.8810006211075765e-06, "loss": -0.0, "reward": 5.0, "reward_std": 1.5, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1871 }, { "completion_length": 346.5, "epoch": 0.5188470066518847, "grad_norm": 0.5354191064834595, "kl": 0.09037494659423828, "learning_rate": 4.880867123650306e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1872 }, { "completion_length": 354.0, "epoch": 0.5191241685144125, "grad_norm": 0.0, "kl": 0.09465427696704865, "learning_rate": 4.880733553181744e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1873 }, { "completion_length": 402.25, "epoch": 0.5194013303769401, "grad_norm": 0.40110695362091064, "kl": 0.08618547767400742, "learning_rate": 4.880599909705988e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1874 }, { "completion_length": 415.25, "epoch": 0.5196784922394678, "grad_norm": 0.47078463435173035, "kl": 0.08661483973264694, "learning_rate": 4.880466193227137e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1875 }, { "completion_length": 360.5, "epoch": 0.5199556541019955, "grad_norm": 0.45930105447769165, "kl": 0.11225612461566925, "learning_rate": 4.880332403749289e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1876 }, { "completion_length": 425.0, "epoch": 0.5202328159645233, "grad_norm": 0.41697925329208374, "kl": 0.07872477918863297, "learning_rate": 4.8801985412765495e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1877 }, { "completion_length": 475.75, "epoch": 0.520509977827051, "grad_norm": 0.33214402198791504, "kl": 0.06657418608665466, "learning_rate": 4.880064605813021e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1878 }, { "completion_length": 419.25, "epoch": 0.5207871396895787, "grad_norm": 0.0, "kl": 0.08917641639709473, "learning_rate": 4.879930597362812e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1879 }, { "completion_length": 418.75, "epoch": 0.5210643015521065, "grad_norm": 0.4050862193107605, "kl": 0.09575168788433075, "learning_rate": 4.879796515930032e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1880 }, { "completion_length": 383.0, "epoch": 0.5213414634146342, "grad_norm": 0.0, "kl": 0.06706555932760239, "learning_rate": 4.879662361518793e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1881 }, { "completion_length": 421.0, "epoch": 0.5216186252771619, "grad_norm": 0.3612154424190521, "kl": 0.07600496709346771, "learning_rate": 4.879528134133208e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1882 }, { "completion_length": 368.25, "epoch": 0.5218957871396895, "grad_norm": 0.0, "kl": 0.09385603666305542, "learning_rate": 4.8793938337773935e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1883 }, { "completion_length": 414.75, "epoch": 0.5221729490022173, "grad_norm": 0.44347643852233887, "kl": 0.06826627999544144, "learning_rate": 4.879259460455468e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1884 }, { "completion_length": 339.75, "epoch": 0.522450110864745, "grad_norm": 0.5266106128692627, "kl": 0.09628083556890488, "learning_rate": 4.879125014171553e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1885 }, { "completion_length": 381.0, "epoch": 0.5227272727272727, "grad_norm": 0.4589262008666992, "kl": 0.10815295577049255, "learning_rate": 4.878990494929771e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1886 }, { "completion_length": 415.25, "epoch": 0.5230044345898004, "grad_norm": 0.0, "kl": 0.08606857061386108, "learning_rate": 4.878855902734246e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1887 }, { "completion_length": 492.5, "epoch": 0.5232815964523282, "grad_norm": 0.0, "kl": 0.08597104996442795, "learning_rate": 4.878721237589107e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1888 }, { "completion_length": 445.5, "epoch": 0.5235587583148559, "grad_norm": 0.38277435302734375, "kl": 0.1111965924501419, "learning_rate": 4.878586499498482e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1889 }, { "completion_length": 411.25, "epoch": 0.5238359201773836, "grad_norm": 0.4087737202644348, "kl": 0.07307419180870056, "learning_rate": 4.878451688466504e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1890 }, { "completion_length": 363.5, "epoch": 0.5241130820399114, "grad_norm": 0.0, "kl": 0.1116340383887291, "learning_rate": 4.878316804497307e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1891 }, { "completion_length": 361.5, "epoch": 0.524390243902439, "grad_norm": 0.4921362102031708, "kl": 0.10489638149738312, "learning_rate": 4.878181847595027e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1892 }, { "completion_length": 427.0, "epoch": 0.5246674057649667, "grad_norm": 0.3954429030418396, "kl": 0.10339000821113586, "learning_rate": 4.878046817763803e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1893 }, { "completion_length": 399.0, "epoch": 0.5249445676274944, "grad_norm": 0.0, "kl": 0.14513225853443146, "learning_rate": 4.8779117150077755e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1894 }, { "completion_length": 468.0, "epoch": 0.5252217294900222, "grad_norm": 0.4653528034687042, "kl": 0.0705815777182579, "learning_rate": 4.877776539331087e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1895 }, { "completion_length": 386.25, "epoch": 0.5254988913525499, "grad_norm": 0.446467787027359, "kl": 0.09294987469911575, "learning_rate": 4.8776412907378845e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1896 }, { "completion_length": 369.75, "epoch": 0.5257760532150776, "grad_norm": 0.0, "kl": 0.08483540266752243, "learning_rate": 4.8775059692323134e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1897 }, { "completion_length": 482.0, "epoch": 0.5260532150776053, "grad_norm": 0.0, "kl": 0.06903933733701706, "learning_rate": 4.8773705748185255e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1898 }, { "completion_length": 391.75, "epoch": 0.5263303769401331, "grad_norm": 0.4110167622566223, "kl": 0.0991964340209961, "learning_rate": 4.87723510750067e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1899 }, { "completion_length": 395.5, "epoch": 0.5266075388026608, "grad_norm": 0.4688778519630432, "kl": 0.08005868643522263, "learning_rate": 4.877099567282903e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1900 }, { "completion_length": 458.0, "epoch": 0.5268847006651884, "grad_norm": 0.5132694840431213, "kl": 0.08012610673904419, "learning_rate": 4.876963954169382e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1901 }, { "completion_length": 379.75, "epoch": 0.5271618625277162, "grad_norm": 0.0, "kl": 0.08471062034368515, "learning_rate": 4.876828268164264e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1902 }, { "completion_length": 422.75, "epoch": 0.5274390243902439, "grad_norm": 0.4146480858325958, "kl": 0.08963015675544739, "learning_rate": 4.8766925092717105e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1903 }, { "completion_length": 415.5, "epoch": 0.5277161862527716, "grad_norm": 0.39680215716362, "kl": 0.08834671229124069, "learning_rate": 4.876556677495885e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1904 }, { "completion_length": 463.75, "epoch": 0.5279933481152993, "grad_norm": 0.40220481157302856, "kl": 0.11931157112121582, "learning_rate": 4.87642077284095e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1905 }, { "completion_length": 351.25, "epoch": 0.5282705099778271, "grad_norm": 0.44119805097579956, "kl": 0.07403586804866791, "learning_rate": 4.876284795311078e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1906 }, { "completion_length": 471.25, "epoch": 0.5285476718403548, "grad_norm": 0.0, "kl": 0.06645036488771439, "learning_rate": 4.876148744910436e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1907 }, { "completion_length": 387.75, "epoch": 0.5288248337028825, "grad_norm": 0.4945962429046631, "kl": 0.08540220558643341, "learning_rate": 4.8760126216431965e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1908 }, { "completion_length": 425.25, "epoch": 0.5291019955654102, "grad_norm": 0.4456801116466522, "kl": 0.0887686088681221, "learning_rate": 4.875876425513534e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1909 }, { "completion_length": 448.25, "epoch": 0.529379157427938, "grad_norm": 0.3485621213912964, "kl": 0.06553196161985397, "learning_rate": 4.875740156525624e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1910 }, { "completion_length": 373.75, "epoch": 0.5296563192904656, "grad_norm": 0.4891012907028198, "kl": 0.08405601978302002, "learning_rate": 4.875603814683647e-06, "loss": 0.0, "reward": 4.34375, "reward_std": 2.8125, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 1911 }, { "completion_length": 460.0, "epoch": 0.5299334811529933, "grad_norm": 0.38949233293533325, "kl": 0.07343627512454987, "learning_rate": 4.8754673999917826e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1912 }, { "completion_length": 394.5, "epoch": 0.530210643015521, "grad_norm": 0.48726269602775574, "kl": 0.09934384375810623, "learning_rate": 4.8753309124542155e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1913 }, { "completion_length": 476.75, "epoch": 0.5304878048780488, "grad_norm": 0.3913431763648987, "kl": 0.08382825553417206, "learning_rate": 4.87519435207513e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1914 }, { "completion_length": 486.75, "epoch": 0.5307649667405765, "grad_norm": 0.3706984519958496, "kl": 0.08737825602293015, "learning_rate": 4.875057718858715e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1915 }, { "completion_length": 444.0, "epoch": 0.5310421286031042, "grad_norm": 0.3894558250904083, "kl": 0.12369577586650848, "learning_rate": 4.874921012809159e-06, "loss": 0.0, "reward": 2.625, "reward_std": 2.0966243743896484, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.75, "step": 1916 }, { "completion_length": 431.0, "epoch": 0.531319290465632, "grad_norm": 0.0, "kl": 0.07848858833312988, "learning_rate": 4.874784233930655e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1917 }, { "completion_length": 420.5, "epoch": 0.5315964523281597, "grad_norm": 0.43787193298339844, "kl": 0.08044205605983734, "learning_rate": 4.874647382227398e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1918 }, { "completion_length": 404.5, "epoch": 0.5318736141906873, "grad_norm": 0.0, "kl": 0.08897995948791504, "learning_rate": 4.874510457703583e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1919 }, { "completion_length": 376.5, "epoch": 0.532150776053215, "grad_norm": 0.4287278652191162, "kl": 0.09204726666212082, "learning_rate": 4.874373460363411e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1920 }, { "completion_length": 417.5, "epoch": 0.5324279379157428, "grad_norm": 0.0, "kl": 0.06981699913740158, "learning_rate": 4.874236390211081e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1921 }, { "completion_length": 391.5, "epoch": 0.5327050997782705, "grad_norm": 0.4314778447151184, "kl": 0.08244095742702484, "learning_rate": 4.874099247250799e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1922 }, { "completion_length": 421.0, "epoch": 0.5329822616407982, "grad_norm": 0.42522355914115906, "kl": 0.07750073075294495, "learning_rate": 4.873962031486768e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1923 }, { "completion_length": 435.75, "epoch": 0.5332594235033259, "grad_norm": 0.43060773611068726, "kl": 0.08799009025096893, "learning_rate": 4.873824742923197e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1924 }, { "completion_length": 433.25, "epoch": 0.5335365853658537, "grad_norm": 0.0, "kl": 0.08053521066904068, "learning_rate": 4.873687381564296e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1925 }, { "completion_length": 414.0, "epoch": 0.5338137472283814, "grad_norm": 0.0, "kl": 0.11584892123937607, "learning_rate": 4.873549947414278e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1926 }, { "completion_length": 420.25, "epoch": 0.5340909090909091, "grad_norm": 0.0, "kl": 0.12684528529644012, "learning_rate": 4.8734124404773565e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1927 }, { "completion_length": 438.0, "epoch": 0.5343680709534369, "grad_norm": 0.44325944781303406, "kl": 0.08925087749958038, "learning_rate": 4.873274860757748e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1928 }, { "completion_length": 443.5, "epoch": 0.5346452328159645, "grad_norm": 0.3352181613445282, "kl": 0.0843968465924263, "learning_rate": 4.873137208259673e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1929 }, { "completion_length": 426.75, "epoch": 0.5349223946784922, "grad_norm": 0.385917603969574, "kl": 0.09251684695482254, "learning_rate": 4.872999482987352e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1930 }, { "completion_length": 372.75, "epoch": 0.5351995565410199, "grad_norm": 0.4358251988887787, "kl": 0.11615962535142899, "learning_rate": 4.8728616849450074e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1931 }, { "completion_length": 410.5, "epoch": 0.5354767184035477, "grad_norm": 0.0, "kl": 0.08235788345336914, "learning_rate": 4.872723814136866e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1932 }, { "completion_length": 408.0, "epoch": 0.5357538802660754, "grad_norm": 0.0, "kl": 0.08858726918697357, "learning_rate": 4.872585870567155e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1933 }, { "completion_length": 392.25, "epoch": 0.5360310421286031, "grad_norm": 0.5553476810455322, "kl": 0.09801026433706284, "learning_rate": 4.872447854240106e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1934 }, { "completion_length": 383.0, "epoch": 0.5363082039911308, "grad_norm": 0.0, "kl": 0.08635705709457397, "learning_rate": 4.87230976515995e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1935 }, { "completion_length": 440.5, "epoch": 0.5365853658536586, "grad_norm": 0.0, "kl": 0.09075602889060974, "learning_rate": 4.8721716033309215e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1936 }, { "completion_length": 399.75, "epoch": 0.5368625277161863, "grad_norm": 0.0, "kl": 0.0683232843875885, "learning_rate": 4.872033368757259e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1937 }, { "completion_length": 496.0, "epoch": 0.5371396895787139, "grad_norm": 0.0, "kl": 0.06623586267232895, "learning_rate": 4.8718950614432e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1938 }, { "completion_length": 429.5, "epoch": 0.5374168514412417, "grad_norm": 0.0, "kl": 0.08855603635311127, "learning_rate": 4.871756681392986e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1939 }, { "completion_length": 469.75, "epoch": 0.5376940133037694, "grad_norm": 0.3789716362953186, "kl": 0.07766864448785782, "learning_rate": 4.871618228610861e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1940 }, { "completion_length": 404.5, "epoch": 0.5379711751662971, "grad_norm": 0.0, "kl": 0.09138502925634384, "learning_rate": 4.871479703101072e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1941 }, { "completion_length": 397.5, "epoch": 0.5382483370288248, "grad_norm": 0.0, "kl": 0.09720438718795776, "learning_rate": 4.8713411048678635e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1942 }, { "completion_length": 389.0, "epoch": 0.5385254988913526, "grad_norm": 0.3745558559894562, "kl": 0.0820767879486084, "learning_rate": 4.871202433915489e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1943 }, { "completion_length": 445.5, "epoch": 0.5388026607538803, "grad_norm": 0.43897953629493713, "kl": 0.09053201228380203, "learning_rate": 4.8710636902482e-06, "loss": -0.0, "reward": 5.59375, "reward_std": 0.3125, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 1944 }, { "completion_length": 364.0, "epoch": 0.539079822616408, "grad_norm": 0.44565120339393616, "kl": 0.0873863697052002, "learning_rate": 4.870924873870251e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1945 }, { "completion_length": 411.25, "epoch": 0.5393569844789357, "grad_norm": 0.3765197694301605, "kl": 0.08508104085922241, "learning_rate": 4.870785984785898e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1946 }, { "completion_length": 417.25, "epoch": 0.5396341463414634, "grad_norm": 0.3973635137081146, "kl": 0.08006928861141205, "learning_rate": 4.870647022999402e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1947 }, { "completion_length": 372.5, "epoch": 0.5399113082039911, "grad_norm": 0.39653587341308594, "kl": 0.0750313252210617, "learning_rate": 4.870507988515023e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1948 }, { "completion_length": 452.0, "epoch": 0.5401884700665188, "grad_norm": 0.6119776964187622, "kl": 0.0752279981970787, "learning_rate": 4.870368881337025e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1949 }, { "completion_length": 396.0, "epoch": 0.5404656319290465, "grad_norm": 0.3724626302719116, "kl": 0.09163210541009903, "learning_rate": 4.870229701469674e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1950 }, { "completion_length": 364.0, "epoch": 0.5407427937915743, "grad_norm": 0.5438361167907715, "kl": 0.08536563068628311, "learning_rate": 4.870090448917238e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1951 }, { "completion_length": 429.5, "epoch": 0.541019955654102, "grad_norm": 0.42774298787117004, "kl": 0.07650378346443176, "learning_rate": 4.8699511236839865e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1952 }, { "completion_length": 472.75, "epoch": 0.5412971175166297, "grad_norm": 0.3888177275657654, "kl": 0.11355962604284286, "learning_rate": 4.869811725774194e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1953 }, { "completion_length": 415.25, "epoch": 0.5415742793791575, "grad_norm": 0.4104337692260742, "kl": 0.0768284946680069, "learning_rate": 4.869672255192134e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1954 }, { "completion_length": 378.5, "epoch": 0.5418514412416852, "grad_norm": 0.4493911564350128, "kl": 0.0937601700425148, "learning_rate": 4.869532711942083e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1955 }, { "completion_length": 490.0, "epoch": 0.5421286031042128, "grad_norm": 1.4533663988113403, "kl": 0.07071108371019363, "learning_rate": 4.869393096028321e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1956 }, { "completion_length": 392.25, "epoch": 0.5424057649667405, "grad_norm": 0.4111229479312897, "kl": 0.09110666066408157, "learning_rate": 4.869253407455129e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1957 }, { "completion_length": 395.5, "epoch": 0.5426829268292683, "grad_norm": 0.3897128701210022, "kl": 0.08470732718706131, "learning_rate": 4.869113646226791e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1958 }, { "completion_length": 425.25, "epoch": 0.542960088691796, "grad_norm": 0.0, "kl": 0.07135873287916183, "learning_rate": 4.8689738123475924e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1959 }, { "completion_length": 447.5, "epoch": 0.5432372505543237, "grad_norm": 0.38100743293762207, "kl": 0.06862633675336838, "learning_rate": 4.868833905821821e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1960 }, { "completion_length": 411.0, "epoch": 0.5435144124168514, "grad_norm": 0.3761445879936218, "kl": 0.08353046327829361, "learning_rate": 4.8686939266537695e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1961 }, { "completion_length": 387.25, "epoch": 0.5437915742793792, "grad_norm": 0.0, "kl": 0.08259803056716919, "learning_rate": 4.868553874847728e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1962 }, { "completion_length": 439.5, "epoch": 0.5440687361419069, "grad_norm": 0.0, "kl": 0.08138933032751083, "learning_rate": 4.868413750407992e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1963 }, { "completion_length": 388.0, "epoch": 0.5443458980044346, "grad_norm": 0.37669432163238525, "kl": 0.07308677583932877, "learning_rate": 4.868273553338859e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1964 }, { "completion_length": 422.25, "epoch": 0.5446230598669624, "grad_norm": 0.38024264574050903, "kl": 0.0899311900138855, "learning_rate": 4.868133283644627e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1965 }, { "completion_length": 441.0, "epoch": 0.54490022172949, "grad_norm": 0.0, "kl": 0.09054236114025116, "learning_rate": 4.867992941329599e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1966 }, { "completion_length": 455.0, "epoch": 0.5451773835920177, "grad_norm": 0.4272405505180359, "kl": 0.06510865688323975, "learning_rate": 4.867852526398078e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1967 }, { "completion_length": 443.0, "epoch": 0.5454545454545454, "grad_norm": 0.0, "kl": 0.06215916574001312, "learning_rate": 4.867712038854371e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1968 }, { "completion_length": 349.5, "epoch": 0.5457317073170732, "grad_norm": 0.5196641683578491, "kl": 0.09616449475288391, "learning_rate": 4.867571478702784e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1969 }, { "completion_length": 399.75, "epoch": 0.5460088691796009, "grad_norm": 0.41690242290496826, "kl": 0.08976279199123383, "learning_rate": 4.8674308459476284e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1970 }, { "completion_length": 422.75, "epoch": 0.5462860310421286, "grad_norm": 0.3978976011276245, "kl": 0.06694278120994568, "learning_rate": 4.867290140593218e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1971 }, { "completion_length": 379.25, "epoch": 0.5465631929046563, "grad_norm": 0.4533667266368866, "kl": 0.07549519836902618, "learning_rate": 4.867149362643866e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1972 }, { "completion_length": 405.25, "epoch": 0.5468403547671841, "grad_norm": 0.0, "kl": 0.07830343395471573, "learning_rate": 4.86700851210389e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1973 }, { "completion_length": 400.0, "epoch": 0.5471175166297118, "grad_norm": 0.43845731019973755, "kl": 0.061392687261104584, "learning_rate": 4.8668675889776095e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1974 }, { "completion_length": 422.0, "epoch": 0.5473946784922394, "grad_norm": 0.3625909984111786, "kl": 0.09229827672243118, "learning_rate": 4.866726593269346e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1975 }, { "completion_length": 393.75, "epoch": 0.5476718403547672, "grad_norm": 0.0, "kl": 0.08100161701440811, "learning_rate": 4.866585524983424e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1976 }, { "completion_length": 475.0, "epoch": 0.5479490022172949, "grad_norm": 0.0, "kl": 0.06095666438341141, "learning_rate": 4.866444384124168e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1977 }, { "completion_length": 396.75, "epoch": 0.5482261640798226, "grad_norm": 0.4323905408382416, "kl": 0.07816989719867706, "learning_rate": 4.866303170695908e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1978 }, { "completion_length": 490.0, "epoch": 0.5485033259423503, "grad_norm": 0.5870019197463989, "kl": 0.07068484276533127, "learning_rate": 4.866161884702972e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1979 }, { "completion_length": 323.25, "epoch": 0.5487804878048781, "grad_norm": 0.0, "kl": 0.0884968563914299, "learning_rate": 4.866020526149694e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1980 }, { "completion_length": 456.5, "epoch": 0.5490576496674058, "grad_norm": 0.4243658781051636, "kl": 0.06095510348677635, "learning_rate": 4.86587909504041e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1981 }, { "completion_length": 433.75, "epoch": 0.5493348115299335, "grad_norm": 0.4320809841156006, "kl": 0.09927143901586533, "learning_rate": 4.865737591379455e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1982 }, { "completion_length": 465.25, "epoch": 0.5496119733924612, "grad_norm": 0.3704809546470642, "kl": 0.07017114758491516, "learning_rate": 4.865596015171169e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1983 }, { "completion_length": 419.25, "epoch": 0.549889135254989, "grad_norm": 0.0, "kl": 0.09869801253080368, "learning_rate": 4.865454366419895e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1984 }, { "completion_length": 411.0, "epoch": 0.5501662971175166, "grad_norm": 0.40709954500198364, "kl": 0.08120595663785934, "learning_rate": 4.8653126451299745e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1985 }, { "completion_length": 505.25, "epoch": 0.5504434589800443, "grad_norm": 0.29168859124183655, "kl": 0.08087807148694992, "learning_rate": 4.865170851305755e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1986 }, { "completion_length": 420.5, "epoch": 0.5507206208425721, "grad_norm": 0.0, "kl": 0.08622800558805466, "learning_rate": 4.865028984951585e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1987 }, { "completion_length": 422.5, "epoch": 0.5509977827050998, "grad_norm": 0.43533894419670105, "kl": 0.06784852594137192, "learning_rate": 4.864887046071814e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1988 }, { "completion_length": 360.5, "epoch": 0.5512749445676275, "grad_norm": 0.502321183681488, "kl": 0.08519326150417328, "learning_rate": 4.864745034670794e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1989 }, { "completion_length": 360.75, "epoch": 0.5515521064301552, "grad_norm": 0.3893285393714905, "kl": 0.07249588519334793, "learning_rate": 4.864602950752882e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1990 }, { "completion_length": 390.5, "epoch": 0.551829268292683, "grad_norm": 0.0, "kl": 0.17719675600528717, "learning_rate": 4.864460794322433e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1991 }, { "completion_length": 348.75, "epoch": 0.5521064301552107, "grad_norm": 0.5264366269111633, "kl": 0.08098740875720978, "learning_rate": 4.864318565383809e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1992 }, { "completion_length": 362.75, "epoch": 0.5523835920177383, "grad_norm": 0.0, "kl": 0.08347699791193008, "learning_rate": 4.864176263941368e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1993 }, { "completion_length": 426.0, "epoch": 0.552660753880266, "grad_norm": 0.485256165266037, "kl": 0.08093965798616409, "learning_rate": 4.864033889999477e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1994 }, { "completion_length": 435.25, "epoch": 0.5529379157427938, "grad_norm": 0.0, "kl": 0.07590829581022263, "learning_rate": 4.863891443562501e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1995 }, { "completion_length": 342.25, "epoch": 0.5532150776053215, "grad_norm": 0.5530595183372498, "kl": 0.09168769419193268, "learning_rate": 4.863748924634807e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1996 }, { "completion_length": 396.5, "epoch": 0.5534922394678492, "grad_norm": 0.6291319131851196, "kl": 0.07900229096412659, "learning_rate": 4.863606333220767e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1997 }, { "completion_length": 407.75, "epoch": 0.5537694013303769, "grad_norm": 0.0, "kl": 0.07352367043495178, "learning_rate": 4.863463669324753e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1998 }, { "completion_length": 367.5, "epoch": 0.5540465631929047, "grad_norm": 0.5038084387779236, "kl": 0.10279843211174011, "learning_rate": 4.863320932951139e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 1999 }, { "completion_length": 340.75, "epoch": 0.5543237250554324, "grad_norm": 0.0, "kl": 0.09154395014047623, "learning_rate": 4.863178124104305e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2000 }, { "completion_length": 434.25, "epoch": 0.5546008869179601, "grad_norm": 0.4284912049770355, "kl": 0.07059644162654877, "learning_rate": 4.863035242788627e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2001 }, { "completion_length": 433.75, "epoch": 0.5548780487804879, "grad_norm": 0.4767213761806488, "kl": 0.06452896445989609, "learning_rate": 4.862892289008489e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2002 }, { "completion_length": 504.5, "epoch": 0.5551552106430155, "grad_norm": 0.39146688580513, "kl": 0.07132889330387115, "learning_rate": 4.8627492627682735e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2003 }, { "completion_length": 421.0, "epoch": 0.5554323725055432, "grad_norm": 0.40409475564956665, "kl": 0.2986583709716797, "learning_rate": 4.862606164072367e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2004 }, { "completion_length": 426.5, "epoch": 0.5557095343680709, "grad_norm": 0.0, "kl": 0.08143936097621918, "learning_rate": 4.862462992925157e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2005 }, { "completion_length": 352.75, "epoch": 0.5559866962305987, "grad_norm": 0.4682004451751709, "kl": 0.10901257395744324, "learning_rate": 4.862319749331036e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2006 }, { "completion_length": 428.5, "epoch": 0.5562638580931264, "grad_norm": 0.3607328236103058, "kl": 0.07551195472478867, "learning_rate": 4.862176433294394e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2007 }, { "completion_length": 405.75, "epoch": 0.5565410199556541, "grad_norm": 0.3914806544780731, "kl": 0.080589160323143, "learning_rate": 4.862033044819628e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2008 }, { "completion_length": 367.75, "epoch": 0.5568181818181818, "grad_norm": 0.0, "kl": 0.08879150450229645, "learning_rate": 4.861889583911134e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2009 }, { "completion_length": 472.25, "epoch": 0.5570953436807096, "grad_norm": 0.3891927897930145, "kl": 0.07432907819747925, "learning_rate": 4.861746050573311e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2010 }, { "completion_length": 399.0, "epoch": 0.5573725055432373, "grad_norm": 0.4481830298900604, "kl": 0.1030120998620987, "learning_rate": 4.861602444810562e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2011 }, { "completion_length": 467.75, "epoch": 0.5576496674057649, "grad_norm": 0.0, "kl": 0.0725511685013771, "learning_rate": 4.86145876662729e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2012 }, { "completion_length": 385.0, "epoch": 0.5579268292682927, "grad_norm": 0.49397122859954834, "kl": 0.07571981847286224, "learning_rate": 4.861315016027902e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2013 }, { "completion_length": 397.0, "epoch": 0.5582039911308204, "grad_norm": 0.40450185537338257, "kl": 0.08026912808418274, "learning_rate": 4.861171193016804e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2014 }, { "completion_length": 431.75, "epoch": 0.5584811529933481, "grad_norm": 0.37481704354286194, "kl": 0.06852862238883972, "learning_rate": 4.861027297598408e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2015 }, { "completion_length": 455.0, "epoch": 0.5587583148558758, "grad_norm": 0.44570398330688477, "kl": 0.08721420913934708, "learning_rate": 4.860883329777126e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2016 }, { "completion_length": 462.5, "epoch": 0.5590354767184036, "grad_norm": 0.3780376613140106, "kl": 0.07578998804092407, "learning_rate": 4.860739289557374e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2017 }, { "completion_length": 418.0, "epoch": 0.5593126385809313, "grad_norm": 0.4447566568851471, "kl": 0.07867103815078735, "learning_rate": 4.860595176943569e-06, "loss": -0.0, "reward": 3.875, "reward_std": 2.1746647357940674, "rewards/confident_score_func": 1.125, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2018 }, { "completion_length": 489.75, "epoch": 0.559589800443459, "grad_norm": 0.3771217167377472, "kl": 0.06795814633369446, "learning_rate": 4.860450991940129e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2019 }, { "completion_length": 433.5, "epoch": 0.5598669623059866, "grad_norm": 0.0, "kl": 0.07583148777484894, "learning_rate": 4.860306734551476e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2020 }, { "completion_length": 381.5, "epoch": 0.5601441241685144, "grad_norm": 0.0, "kl": 0.06576041877269745, "learning_rate": 4.860162404782034e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2021 }, { "completion_length": 415.0, "epoch": 0.5604212860310421, "grad_norm": 0.0, "kl": 0.0841163918375969, "learning_rate": 4.86001800263623e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2022 }, { "completion_length": 380.25, "epoch": 0.5606984478935698, "grad_norm": 0.4240904450416565, "kl": 0.09775390475988388, "learning_rate": 4.859873528118491e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2023 }, { "completion_length": 424.25, "epoch": 0.5609756097560976, "grad_norm": 0.4125356376171112, "kl": 0.10658107697963715, "learning_rate": 4.859728981233247e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2024 }, { "completion_length": 386.5, "epoch": 0.5612527716186253, "grad_norm": 0.8889422416687012, "kl": 0.07164755463600159, "learning_rate": 4.859584361984932e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2025 }, { "completion_length": 384.75, "epoch": 0.561529933481153, "grad_norm": 0.4556144177913666, "kl": 0.10818961262702942, "learning_rate": 4.85943967037798e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2026 }, { "completion_length": 446.0, "epoch": 0.5618070953436807, "grad_norm": 0.0, "kl": 0.07908956706523895, "learning_rate": 4.8592949064168285e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2027 }, { "completion_length": 496.25, "epoch": 0.5620842572062085, "grad_norm": 0.44047775864601135, "kl": 0.0738312155008316, "learning_rate": 4.859150070105917e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2028 }, { "completion_length": 362.0, "epoch": 0.5623614190687362, "grad_norm": 0.0, "kl": 0.07806842774152756, "learning_rate": 4.8590051614496855e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2029 }, { "completion_length": 395.75, "epoch": 0.5626385809312638, "grad_norm": 0.0, "kl": 0.07010519504547119, "learning_rate": 4.85886018045258e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2030 }, { "completion_length": 416.25, "epoch": 0.5629157427937915, "grad_norm": 0.0, "kl": 0.07375761866569519, "learning_rate": 4.858715127119045e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2031 }, { "completion_length": 352.25, "epoch": 0.5631929046563193, "grad_norm": 0.0, "kl": 0.36886054277420044, "learning_rate": 4.858570001453529e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2032 }, { "completion_length": 375.0, "epoch": 0.563470066518847, "grad_norm": 0.45369040966033936, "kl": 0.08682025969028473, "learning_rate": 4.858424803460483e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2033 }, { "completion_length": 485.75, "epoch": 0.5637472283813747, "grad_norm": 0.40626317262649536, "kl": 0.05944341793656349, "learning_rate": 4.858279533144358e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2034 }, { "completion_length": 412.5, "epoch": 0.5640243902439024, "grad_norm": 0.41761675477027893, "kl": 0.0823737233877182, "learning_rate": 4.8581341905096104e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2035 }, { "completion_length": 423.75, "epoch": 0.5643015521064302, "grad_norm": 0.38990920782089233, "kl": 0.07848712801933289, "learning_rate": 4.857988775560697e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2036 }, { "completion_length": 378.5, "epoch": 0.5645787139689579, "grad_norm": 0.48267796635627747, "kl": 0.08367615193128586, "learning_rate": 4.857843288302077e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2037 }, { "completion_length": 418.5, "epoch": 0.5648558758314856, "grad_norm": 0.3789375126361847, "kl": 0.07538319379091263, "learning_rate": 4.8576977287382116e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2038 }, { "completion_length": 320.75, "epoch": 0.5651330376940134, "grad_norm": 0.5198972821235657, "kl": 0.0961187556385994, "learning_rate": 4.857552096873564e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2039 }, { "completion_length": 371.75, "epoch": 0.565410199556541, "grad_norm": 0.4234837591648102, "kl": 0.08028572052717209, "learning_rate": 4.857406392712602e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2040 }, { "completion_length": 370.5, "epoch": 0.5656873614190687, "grad_norm": 0.4055730998516083, "kl": 0.09478407353162766, "learning_rate": 4.857260616259791e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2041 }, { "completion_length": 369.5, "epoch": 0.5659645232815964, "grad_norm": 0.43018221855163574, "kl": 0.09607990086078644, "learning_rate": 4.857114767519604e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2042 }, { "completion_length": 408.0, "epoch": 0.5662416851441242, "grad_norm": 0.3914042115211487, "kl": 0.07043670862913132, "learning_rate": 4.856968846496512e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2043 }, { "completion_length": 393.0, "epoch": 0.5665188470066519, "grad_norm": 0.0, "kl": 0.1732073575258255, "learning_rate": 4.856822853194991e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2044 }, { "completion_length": 425.75, "epoch": 0.5667960088691796, "grad_norm": 0.4188489317893982, "kl": 0.075436070561409, "learning_rate": 4.856676787619517e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2045 }, { "completion_length": 393.0, "epoch": 0.5670731707317073, "grad_norm": 0.4208044409751892, "kl": 0.08632692694664001, "learning_rate": 4.856530649774568e-06, "loss": 0.0, "reward": 4.59375, "reward_std": 1.9185905456542969, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 2046 }, { "completion_length": 418.25, "epoch": 0.5673503325942351, "grad_norm": 0.3750656843185425, "kl": 0.061638422310352325, "learning_rate": 4.856384439664628e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2047 }, { "completion_length": 427.0, "epoch": 0.5676274944567627, "grad_norm": 0.0, "kl": 0.07102501392364502, "learning_rate": 4.85623815729418e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2048 }, { "completion_length": 428.25, "epoch": 0.5679046563192904, "grad_norm": 0.0, "kl": 0.0680244192481041, "learning_rate": 4.856091802667708e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2049 }, { "completion_length": 360.0, "epoch": 0.5681818181818182, "grad_norm": 0.0, "kl": 0.1811152845621109, "learning_rate": 4.855945375789703e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2050 }, { "completion_length": 519.75, "epoch": 0.5684589800443459, "grad_norm": 0.3435584008693695, "kl": 0.0706031545996666, "learning_rate": 4.855798876664653e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2051 }, { "completion_length": 370.0, "epoch": 0.5687361419068736, "grad_norm": 0.4070250689983368, "kl": 0.08408551663160324, "learning_rate": 4.855652305297052e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2052 }, { "completion_length": 398.75, "epoch": 0.5690133037694013, "grad_norm": 0.4246976375579834, "kl": 0.09300495684146881, "learning_rate": 4.855505661691393e-06, "loss": 0.0, "reward": 3.625, "reward_std": 2.462214469909668, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2053 }, { "completion_length": 341.5, "epoch": 0.5692904656319291, "grad_norm": 0.0, "kl": 0.09149360656738281, "learning_rate": 4.855358945852175e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2054 }, { "completion_length": 415.0, "epoch": 0.5695676274944568, "grad_norm": 0.4355056881904602, "kl": 0.10182805359363556, "learning_rate": 4.855212157783896e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2055 }, { "completion_length": 382.75, "epoch": 0.5698447893569845, "grad_norm": 0.43667200207710266, "kl": 0.10300454497337341, "learning_rate": 4.855065297491057e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2056 }, { "completion_length": 424.25, "epoch": 0.5701219512195121, "grad_norm": 0.3667008578777313, "kl": 0.11974328011274338, "learning_rate": 4.854918364978163e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2057 }, { "completion_length": 438.5, "epoch": 0.5703991130820399, "grad_norm": 0.0, "kl": 0.07686755061149597, "learning_rate": 4.854771360249718e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2058 }, { "completion_length": 420.75, "epoch": 0.5706762749445676, "grad_norm": 0.34406888484954834, "kl": 0.09758276492357254, "learning_rate": 4.854624283310232e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2059 }, { "completion_length": 373.25, "epoch": 0.5709534368070953, "grad_norm": 0.44436025619506836, "kl": 0.09720689803361893, "learning_rate": 4.8544771341642136e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2060 }, { "completion_length": 378.25, "epoch": 0.5712305986696231, "grad_norm": 0.0, "kl": 0.07217516750097275, "learning_rate": 4.854329912816176e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2061 }, { "completion_length": 413.5, "epoch": 0.5715077605321508, "grad_norm": 0.4425058960914612, "kl": 0.08937069773674011, "learning_rate": 4.854182619270634e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2062 }, { "completion_length": 393.5, "epoch": 0.5717849223946785, "grad_norm": 0.4304827153682709, "kl": 0.11474283784627914, "learning_rate": 4.854035253532103e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2063 }, { "completion_length": 376.5, "epoch": 0.5720620842572062, "grad_norm": 0.0, "kl": 0.08253751695156097, "learning_rate": 4.853887815605105e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2064 }, { "completion_length": 408.25, "epoch": 0.572339246119734, "grad_norm": 0.34867286682128906, "kl": 0.08981402963399887, "learning_rate": 4.853740305494159e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2065 }, { "completion_length": 463.5, "epoch": 0.5726164079822617, "grad_norm": 0.3333205282688141, "kl": 0.11488942801952362, "learning_rate": 4.853592723203788e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2066 }, { "completion_length": 435.75, "epoch": 0.5728935698447893, "grad_norm": 0.0, "kl": 0.08136173337697983, "learning_rate": 4.853445068738521e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2067 }, { "completion_length": 427.75, "epoch": 0.573170731707317, "grad_norm": 0.0, "kl": 0.09409529715776443, "learning_rate": 4.853297342102882e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2068 }, { "completion_length": 378.5, "epoch": 0.5734478935698448, "grad_norm": 0.0, "kl": 0.11334504932165146, "learning_rate": 4.853149543301404e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2069 }, { "completion_length": 471.75, "epoch": 0.5737250554323725, "grad_norm": 0.4198521077632904, "kl": 0.06820107996463776, "learning_rate": 4.853001672338618e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2070 }, { "completion_length": 405.0, "epoch": 0.5740022172949002, "grad_norm": 0.4554463028907776, "kl": 0.10267701745033264, "learning_rate": 4.85285372921906e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2071 }, { "completion_length": 488.0, "epoch": 0.5742793791574279, "grad_norm": 0.0, "kl": 0.07853063941001892, "learning_rate": 4.852705713947264e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2072 }, { "completion_length": 329.5, "epoch": 0.5745565410199557, "grad_norm": 0.0, "kl": 0.07258035987615585, "learning_rate": 4.852557626527771e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2073 }, { "completion_length": 425.5, "epoch": 0.5748337028824834, "grad_norm": 0.0, "kl": 0.09013406187295914, "learning_rate": 4.852409466965123e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2074 }, { "completion_length": 382.25, "epoch": 0.575110864745011, "grad_norm": 0.0, "kl": 0.17013394832611084, "learning_rate": 4.852261235263862e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2075 }, { "completion_length": 376.0, "epoch": 0.5753880266075388, "grad_norm": 0.0, "kl": 0.07985862344503403, "learning_rate": 4.852112931428534e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2076 }, { "completion_length": 506.5, "epoch": 0.5756651884700665, "grad_norm": 0.0, "kl": 0.07644307613372803, "learning_rate": 4.851964555463687e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2077 }, { "completion_length": 347.75, "epoch": 0.5759423503325942, "grad_norm": 0.4943073093891144, "kl": 0.08451520651578903, "learning_rate": 4.851816107373871e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2078 }, { "completion_length": 360.25, "epoch": 0.5762195121951219, "grad_norm": 0.0, "kl": 0.08886346220970154, "learning_rate": 4.851667587163638e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2079 }, { "completion_length": 411.75, "epoch": 0.5764966740576497, "grad_norm": 0.0, "kl": 0.08078023791313171, "learning_rate": 4.851518994837544e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2080 }, { "completion_length": 416.75, "epoch": 0.5767738359201774, "grad_norm": 0.3676814138889313, "kl": 0.07649673521518707, "learning_rate": 4.851370330400143e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2081 }, { "completion_length": 483.5, "epoch": 0.5770509977827051, "grad_norm": 0.3434486985206604, "kl": 0.07566536217927933, "learning_rate": 4.8512215938559955e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2082 }, { "completion_length": 352.0, "epoch": 0.5773281596452328, "grad_norm": 0.42923688888549805, "kl": 0.09054207056760788, "learning_rate": 4.851072785209664e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2083 }, { "completion_length": 389.25, "epoch": 0.5776053215077606, "grad_norm": 0.0, "kl": 0.09555219858884811, "learning_rate": 4.850923904465709e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2084 }, { "completion_length": 355.0, "epoch": 0.5778824833702882, "grad_norm": 0.5888808965682983, "kl": 0.0948779508471489, "learning_rate": 4.850774951628698e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2085 }, { "completion_length": 385.25, "epoch": 0.5781596452328159, "grad_norm": 0.4598921537399292, "kl": 0.08381164073944092, "learning_rate": 4.850625926703198e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2086 }, { "completion_length": 361.5, "epoch": 0.5784368070953437, "grad_norm": 0.0, "kl": 0.09823485463857651, "learning_rate": 4.85047682969378e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2087 }, { "completion_length": 389.75, "epoch": 0.5787139689578714, "grad_norm": 0.0, "kl": 0.09496559202671051, "learning_rate": 4.850327660605014e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2088 }, { "completion_length": 396.25, "epoch": 0.5789911308203991, "grad_norm": 0.3983772099018097, "kl": 0.12101321667432785, "learning_rate": 4.850178419441477e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2089 }, { "completion_length": 497.0, "epoch": 0.5792682926829268, "grad_norm": 0.0, "kl": 0.08230748772621155, "learning_rate": 4.850029106207744e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2090 }, { "completion_length": 431.0, "epoch": 0.5795454545454546, "grad_norm": 0.43412432074546814, "kl": 0.08579795062541962, "learning_rate": 4.849879720908394e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2091 }, { "completion_length": 426.75, "epoch": 0.5798226164079823, "grad_norm": 0.0, "kl": 0.08954225480556488, "learning_rate": 4.849730263548008e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2092 }, { "completion_length": 502.0, "epoch": 0.58009977827051, "grad_norm": 0.0, "kl": 0.07394362986087799, "learning_rate": 4.84958073413117e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2093 }, { "completion_length": 377.75, "epoch": 0.5803769401330376, "grad_norm": 0.5914613604545593, "kl": 0.08022619783878326, "learning_rate": 4.849431132662464e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2094 }, { "completion_length": 427.25, "epoch": 0.5806541019955654, "grad_norm": 0.41911405324935913, "kl": 0.09631370007991791, "learning_rate": 4.84928145914648e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2095 }, { "completion_length": 337.5, "epoch": 0.5809312638580931, "grad_norm": 0.0, "kl": 0.08756314963102341, "learning_rate": 4.849131713587805e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2096 }, { "completion_length": 377.75, "epoch": 0.5812084257206208, "grad_norm": 0.42163705825805664, "kl": 0.12144008278846741, "learning_rate": 4.848981895991033e-06, "loss": -0.0, "reward": 2.5, "reward_std": 2.217355728149414, "rewards/confident_score_func": 0.25, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2097 }, { "completion_length": 372.75, "epoch": 0.5814855875831486, "grad_norm": 0.4479188919067383, "kl": 0.0995790958404541, "learning_rate": 4.848832006360758e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2098 }, { "completion_length": 432.75, "epoch": 0.5817627494456763, "grad_norm": 0.44523537158966064, "kl": 0.09519080072641373, "learning_rate": 4.848682044701576e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2099 }, { "completion_length": 374.0, "epoch": 0.582039911308204, "grad_norm": 0.46866223216056824, "kl": 0.13739749789237976, "learning_rate": 4.848532011018086e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2100 }, { "completion_length": 359.0, "epoch": 0.5823170731707317, "grad_norm": 0.0, "kl": 0.09761591255664825, "learning_rate": 4.8483819053148895e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2101 }, { "completion_length": 405.25, "epoch": 0.5825942350332595, "grad_norm": 0.41989704966545105, "kl": 0.09335498511791229, "learning_rate": 4.848231727596588e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2102 }, { "completion_length": 441.75, "epoch": 0.5828713968957872, "grad_norm": 0.0, "kl": 0.08952996134757996, "learning_rate": 4.8480814778677885e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2103 }, { "completion_length": 400.75, "epoch": 0.5831485587583148, "grad_norm": 0.0, "kl": 0.09220411628484726, "learning_rate": 4.847931156133097e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2104 }, { "completion_length": 444.0, "epoch": 0.5834257206208425, "grad_norm": 0.0, "kl": 0.11016608029603958, "learning_rate": 4.847780762397125e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2105 }, { "completion_length": 482.5, "epoch": 0.5837028824833703, "grad_norm": 0.0, "kl": 0.08719482272863388, "learning_rate": 4.8476302966644835e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2106 }, { "completion_length": 462.0, "epoch": 0.583980044345898, "grad_norm": 0.0, "kl": 0.09465502947568893, "learning_rate": 4.847479758939787e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2107 }, { "completion_length": 348.0, "epoch": 0.5842572062084257, "grad_norm": 0.608462929725647, "kl": 0.079344742000103, "learning_rate": 4.847329149227651e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2108 }, { "completion_length": 396.25, "epoch": 0.5845343680709535, "grad_norm": 0.0, "kl": 0.0874602347612381, "learning_rate": 4.847178467532694e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2109 }, { "completion_length": 387.25, "epoch": 0.5848115299334812, "grad_norm": 0.0, "kl": 0.095375657081604, "learning_rate": 4.847027713859538e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2110 }, { "completion_length": 411.75, "epoch": 0.5850886917960089, "grad_norm": 0.42489326000213623, "kl": 0.07993526756763458, "learning_rate": 4.846876888212806e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2111 }, { "completion_length": 388.25, "epoch": 0.5853658536585366, "grad_norm": 0.0, "kl": 0.08309777826070786, "learning_rate": 4.846725990597122e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2112 }, { "completion_length": 391.75, "epoch": 0.5856430155210643, "grad_norm": 0.0, "kl": 0.09750977158546448, "learning_rate": 4.846575021017114e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2113 }, { "completion_length": 400.0, "epoch": 0.585920177383592, "grad_norm": 0.5122290253639221, "kl": 0.11352474242448807, "learning_rate": 4.846423979477411e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2114 }, { "completion_length": 464.25, "epoch": 0.5861973392461197, "grad_norm": 0.0, "kl": 0.09199882298707962, "learning_rate": 4.846272865982647e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2115 }, { "completion_length": 421.25, "epoch": 0.5864745011086474, "grad_norm": 0.0, "kl": 0.08745948225259781, "learning_rate": 4.846121680537453e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2116 }, { "completion_length": 431.25, "epoch": 0.5867516629711752, "grad_norm": 0.0, "kl": 0.0885947048664093, "learning_rate": 4.845970423146468e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2117 }, { "completion_length": 437.0, "epoch": 0.5870288248337029, "grad_norm": 0.0, "kl": 0.10651073604822159, "learning_rate": 4.845819093814328e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2118 }, { "completion_length": 460.5, "epoch": 0.5873059866962306, "grad_norm": 0.0, "kl": 0.08866751939058304, "learning_rate": 4.845667692545675e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2119 }, { "completion_length": 411.5, "epoch": 0.5875831485587583, "grad_norm": 0.0, "kl": 0.08958540856838226, "learning_rate": 4.8455162193451525e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2120 }, { "completion_length": 417.5, "epoch": 0.5878603104212861, "grad_norm": 0.0, "kl": 0.09370159357786179, "learning_rate": 4.8453646742174035e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2121 }, { "completion_length": 327.25, "epoch": 0.5881374722838137, "grad_norm": 0.0, "kl": 0.10719772428274155, "learning_rate": 4.8452130571670764e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2122 }, { "completion_length": 494.0, "epoch": 0.5884146341463414, "grad_norm": 0.0, "kl": 0.08654297143220901, "learning_rate": 4.8450613681988215e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2123 }, { "completion_length": 370.25, "epoch": 0.5886917960088692, "grad_norm": 0.0, "kl": 0.08382734656333923, "learning_rate": 4.844909607317289e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2124 }, { "completion_length": 378.25, "epoch": 0.5889689578713969, "grad_norm": 0.0, "kl": 0.08250518888235092, "learning_rate": 4.844757774527134e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2125 }, { "completion_length": 415.5, "epoch": 0.5892461197339246, "grad_norm": 0.0, "kl": 0.0810188353061676, "learning_rate": 4.844605869833011e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2126 }, { "completion_length": 428.25, "epoch": 0.5895232815964523, "grad_norm": 0.0, "kl": 0.11027291417121887, "learning_rate": 4.844453893239581e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2127 }, { "completion_length": 372.25, "epoch": 0.5898004434589801, "grad_norm": 0.46373671293258667, "kl": 0.07606393843889236, "learning_rate": 4.844301844751501e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2128 }, { "completion_length": 354.25, "epoch": 0.5900776053215078, "grad_norm": 0.0, "kl": 0.09883329272270203, "learning_rate": 4.844149724373437e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2129 }, { "completion_length": 482.75, "epoch": 0.5903547671840355, "grad_norm": 0.5259829163551331, "kl": 0.08575250208377838, "learning_rate": 4.843997532110051e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2130 }, { "completion_length": 336.75, "epoch": 0.5906319290465631, "grad_norm": 0.40383344888687134, "kl": 0.14909397065639496, "learning_rate": 4.843845267966012e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2131 }, { "completion_length": 410.75, "epoch": 0.5909090909090909, "grad_norm": 0.4841603934764862, "kl": 0.1005832701921463, "learning_rate": 4.843692931945989e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2132 }, { "completion_length": 370.5, "epoch": 0.5911862527716186, "grad_norm": 0.0, "kl": 0.1016218289732933, "learning_rate": 4.8435405240546534e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2133 }, { "completion_length": 362.5, "epoch": 0.5914634146341463, "grad_norm": 0.0, "kl": 0.09156270325183868, "learning_rate": 4.843388044296679e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2134 }, { "completion_length": 377.75, "epoch": 0.5917405764966741, "grad_norm": 0.0, "kl": 0.10039643943309784, "learning_rate": 4.843235492676741e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2135 }, { "completion_length": 369.25, "epoch": 0.5920177383592018, "grad_norm": 0.0, "kl": 0.1256626546382904, "learning_rate": 4.843082869199518e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2136 }, { "completion_length": 387.75, "epoch": 0.5922949002217295, "grad_norm": 0.0, "kl": 0.13076767325401306, "learning_rate": 4.84293017386969e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2137 }, { "completion_length": 405.0, "epoch": 0.5925720620842572, "grad_norm": 0.4271929860115051, "kl": 0.07793687283992767, "learning_rate": 4.842777406691941e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2138 }, { "completion_length": 417.75, "epoch": 0.592849223946785, "grad_norm": 0.5120920538902283, "kl": 0.07241766899824142, "learning_rate": 4.842624567670954e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2139 }, { "completion_length": 399.75, "epoch": 0.5931263858093127, "grad_norm": 0.7687183618545532, "kl": 0.10597825795412064, "learning_rate": 4.842471656811416e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2140 }, { "completion_length": 368.0, "epoch": 0.5934035476718403, "grad_norm": 0.0, "kl": 0.11928553879261017, "learning_rate": 4.842318674118019e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2141 }, { "completion_length": 407.0, "epoch": 0.593680709534368, "grad_norm": 0.0, "kl": 0.09048278629779816, "learning_rate": 4.84216561959545e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2142 }, { "completion_length": 382.75, "epoch": 0.5939578713968958, "grad_norm": 0.0, "kl": 0.08284194767475128, "learning_rate": 4.842012493248405e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2143 }, { "completion_length": 378.75, "epoch": 0.5942350332594235, "grad_norm": 0.0, "kl": 0.1258024126291275, "learning_rate": 4.841859295081579e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2144 }, { "completion_length": 343.75, "epoch": 0.5945121951219512, "grad_norm": 0.0, "kl": 0.09882085770368576, "learning_rate": 4.841706025099671e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2145 }, { "completion_length": 425.5, "epoch": 0.594789356984479, "grad_norm": 0.0, "kl": 0.07497615367174149, "learning_rate": 4.84155268330738e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2146 }, { "completion_length": 368.0, "epoch": 0.5950665188470067, "grad_norm": 0.5074739456176758, "kl": 0.10598649084568024, "learning_rate": 4.841399269709409e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2147 }, { "completion_length": 339.0, "epoch": 0.5953436807095344, "grad_norm": 0.4836592674255371, "kl": 0.1118423193693161, "learning_rate": 4.841245784310463e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2148 }, { "completion_length": 417.25, "epoch": 0.595620842572062, "grad_norm": 0.4227878153324127, "kl": 0.1392248421907425, "learning_rate": 4.841092227115247e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2149 }, { "completion_length": 399.75, "epoch": 0.5958980044345898, "grad_norm": 0.0, "kl": 0.09069311618804932, "learning_rate": 4.840938598128472e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2150 }, { "completion_length": 424.5, "epoch": 0.5961751662971175, "grad_norm": 0.43256881833076477, "kl": 0.07531581073999405, "learning_rate": 4.840784897354848e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2151 }, { "completion_length": 397.75, "epoch": 0.5964523281596452, "grad_norm": 0.0, "kl": 0.09857112169265747, "learning_rate": 4.840631124799089e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2152 }, { "completion_length": 369.0, "epoch": 0.5967294900221729, "grad_norm": 0.0, "kl": 0.10717061907052994, "learning_rate": 4.84047728046591e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2153 }, { "completion_length": 451.25, "epoch": 0.5970066518847007, "grad_norm": 0.0, "kl": 0.10451241582632065, "learning_rate": 4.840323364360028e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2154 }, { "completion_length": 354.5, "epoch": 0.5972838137472284, "grad_norm": 0.0, "kl": 0.1049930527806282, "learning_rate": 4.8401693764861644e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2155 }, { "completion_length": 411.0, "epoch": 0.5975609756097561, "grad_norm": 0.0, "kl": 0.09631093591451645, "learning_rate": 4.840015316849042e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2156 }, { "completion_length": 374.5, "epoch": 0.5978381374722838, "grad_norm": 0.0, "kl": 0.08334728330373764, "learning_rate": 4.839861185453383e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2157 }, { "completion_length": 388.75, "epoch": 0.5981152993348116, "grad_norm": 0.4713943302631378, "kl": 0.08323488384485245, "learning_rate": 4.839706982303915e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2158 }, { "completion_length": 439.5, "epoch": 0.5983924611973392, "grad_norm": 0.3573652505874634, "kl": 0.0815349817276001, "learning_rate": 4.839552707405367e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2159 }, { "completion_length": 365.25, "epoch": 0.5986696230598669, "grad_norm": 0.0, "kl": 0.07600024342536926, "learning_rate": 4.839398360762469e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2160 }, { "completion_length": 405.75, "epoch": 0.5989467849223947, "grad_norm": 0.40844935178756714, "kl": 0.1224682554602623, "learning_rate": 4.839243942379957e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2161 }, { "completion_length": 367.0, "epoch": 0.5992239467849224, "grad_norm": 0.4109137952327728, "kl": 0.11636735498905182, "learning_rate": 4.839089452262562e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2162 }, { "completion_length": 433.5, "epoch": 0.5995011086474501, "grad_norm": 0.5124434232711792, "kl": 0.08123345673084259, "learning_rate": 4.8389348904150255e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2163 }, { "completion_length": 453.75, "epoch": 0.5997782705099778, "grad_norm": 0.3674982190132141, "kl": 0.08488666266202927, "learning_rate": 4.838780256842085e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2164 }, { "completion_length": 397.25, "epoch": 0.6000554323725056, "grad_norm": 0.0, "kl": 0.08357291668653488, "learning_rate": 4.838625551548482e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2165 }, { "completion_length": 442.25, "epoch": 0.6003325942350333, "grad_norm": 0.0, "kl": 0.08736824989318848, "learning_rate": 4.838470774538963e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2166 }, { "completion_length": 411.5, "epoch": 0.600609756097561, "grad_norm": 0.0, "kl": 0.09268944710493088, "learning_rate": 4.838315925818274e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2167 }, { "completion_length": 397.25, "epoch": 0.6008869179600886, "grad_norm": 0.0, "kl": 0.09606237709522247, "learning_rate": 4.838161005391161e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2168 }, { "completion_length": 430.5, "epoch": 0.6011640798226164, "grad_norm": 0.37743937969207764, "kl": 0.07211523503065109, "learning_rate": 4.8380060132623776e-06, "loss": 0.0, "reward": 2.625, "reward_std": 2.0966243743896484, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2169 }, { "completion_length": 463.25, "epoch": 0.6014412416851441, "grad_norm": 0.44438061118125916, "kl": 0.16457906365394592, "learning_rate": 4.837850949436676e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2170 }, { "completion_length": 397.75, "epoch": 0.6017184035476718, "grad_norm": 0.0, "kl": 0.08752241730690002, "learning_rate": 4.837695813918809e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2171 }, { "completion_length": 426.25, "epoch": 0.6019955654101996, "grad_norm": 0.0, "kl": 0.06998803466558456, "learning_rate": 4.837540606713538e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2172 }, { "completion_length": 529.5, "epoch": 0.6022727272727273, "grad_norm": 0.452722430229187, "kl": 0.12061357498168945, "learning_rate": 4.8373853278256186e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2173 }, { "completion_length": 439.0, "epoch": 0.602549889135255, "grad_norm": 0.381234347820282, "kl": 0.08876544237136841, "learning_rate": 4.837229977259816e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2174 }, { "completion_length": 393.25, "epoch": 0.6028270509977827, "grad_norm": 0.0, "kl": 0.09612533450126648, "learning_rate": 4.837074555020893e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2175 }, { "completion_length": 431.0, "epoch": 0.6031042128603105, "grad_norm": 0.0, "kl": 0.07944080233573914, "learning_rate": 4.836919061113614e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2176 }, { "completion_length": 371.5, "epoch": 0.6033813747228381, "grad_norm": 0.38768190145492554, "kl": 0.0818474069237709, "learning_rate": 4.836763495542749e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2177 }, { "completion_length": 380.75, "epoch": 0.6036585365853658, "grad_norm": 0.43825846910476685, "kl": 0.1079402044415474, "learning_rate": 4.836607858313068e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2178 }, { "completion_length": 401.0, "epoch": 0.6039356984478935, "grad_norm": 0.0, "kl": 0.09439828246831894, "learning_rate": 4.836452149429346e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2179 }, { "completion_length": 368.0, "epoch": 0.6042128603104213, "grad_norm": 0.0, "kl": 0.10168569535017014, "learning_rate": 4.836296368896354e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2180 }, { "completion_length": 380.5, "epoch": 0.604490022172949, "grad_norm": 0.44121411442756653, "kl": 0.09026103466749191, "learning_rate": 4.836140516718872e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2181 }, { "completion_length": 419.5, "epoch": 0.6047671840354767, "grad_norm": 0.0, "kl": 0.08229769021272659, "learning_rate": 4.835984592901678e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2182 }, { "completion_length": 389.75, "epoch": 0.6050443458980045, "grad_norm": 1.1820068359375, "kl": 0.1010633111000061, "learning_rate": 4.835828597449554e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2183 }, { "completion_length": 468.75, "epoch": 0.6053215077605322, "grad_norm": 0.3305647671222687, "kl": 0.06896105408668518, "learning_rate": 4.835672530367285e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2184 }, { "completion_length": 416.0, "epoch": 0.6055986696230599, "grad_norm": 0.40278109908103943, "kl": 0.0784531682729721, "learning_rate": 4.8355163916596536e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2185 }, { "completion_length": 371.0, "epoch": 0.6058758314855875, "grad_norm": 0.6435714960098267, "kl": 0.0929100438952446, "learning_rate": 4.835360181331451e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2186 }, { "completion_length": 412.25, "epoch": 0.6061529933481153, "grad_norm": 0.41089802980422974, "kl": 0.07369040697813034, "learning_rate": 4.8352038993874665e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2187 }, { "completion_length": 411.5, "epoch": 0.606430155210643, "grad_norm": 0.4465278685092926, "kl": 0.0919799655675888, "learning_rate": 4.835047545832493e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2188 }, { "completion_length": 382.75, "epoch": 0.6067073170731707, "grad_norm": 0.43536147475242615, "kl": 0.09460947662591934, "learning_rate": 4.834891120671324e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2189 }, { "completion_length": 395.0, "epoch": 0.6069844789356984, "grad_norm": 0.41347408294677734, "kl": 0.0905301570892334, "learning_rate": 4.8347346239087575e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2190 }, { "completion_length": 356.0, "epoch": 0.6072616407982262, "grad_norm": 0.0, "kl": 0.09658676385879517, "learning_rate": 4.834578055549592e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2191 }, { "completion_length": 376.0, "epoch": 0.6075388026607539, "grad_norm": 0.0, "kl": 0.08957749605178833, "learning_rate": 4.8344214155986305e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2192 }, { "completion_length": 420.5, "epoch": 0.6078159645232816, "grad_norm": 0.4142471253871918, "kl": 0.08335249871015549, "learning_rate": 4.834264704060674e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2193 }, { "completion_length": 432.75, "epoch": 0.6080931263858093, "grad_norm": 0.3850663900375366, "kl": 0.07412425428628922, "learning_rate": 4.834107920940529e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2194 }, { "completion_length": 298.0, "epoch": 0.608370288248337, "grad_norm": 0.0, "kl": 0.10390432924032211, "learning_rate": 4.833951066243004e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2195 }, { "completion_length": 390.0, "epoch": 0.6086474501108647, "grad_norm": 0.0, "kl": 0.10290597379207611, "learning_rate": 4.83379413997291e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2196 }, { "completion_length": 346.0, "epoch": 0.6089246119733924, "grad_norm": 0.0, "kl": 0.09766188263893127, "learning_rate": 4.833637142135057e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2197 }, { "completion_length": 324.5, "epoch": 0.6092017738359202, "grad_norm": 0.0, "kl": 0.09237802028656006, "learning_rate": 4.833480072734261e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2198 }, { "completion_length": 412.25, "epoch": 0.6094789356984479, "grad_norm": 0.39147844910621643, "kl": 0.07742352783679962, "learning_rate": 4.833322931775337e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2199 }, { "completion_length": 355.0, "epoch": 0.6097560975609756, "grad_norm": 0.6126692295074463, "kl": 0.09079945832490921, "learning_rate": 4.833165719263107e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2200 }, { "completion_length": 412.0, "epoch": 0.6100332594235033, "grad_norm": 0.41250941157341003, "kl": 0.08452283591032028, "learning_rate": 4.833008435202389e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2201 }, { "completion_length": 340.5, "epoch": 0.6103104212860311, "grad_norm": 0.48502397537231445, "kl": 0.0690758153796196, "learning_rate": 4.832851079598007e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2202 }, { "completion_length": 470.5, "epoch": 0.6105875831485588, "grad_norm": 0.3992123603820801, "kl": 0.08257990330457687, "learning_rate": 4.832693652454787e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2203 }, { "completion_length": 379.75, "epoch": 0.6108647450110865, "grad_norm": 0.0, "kl": 0.09826812893152237, "learning_rate": 4.832536153777557e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2204 }, { "completion_length": 447.25, "epoch": 0.6111419068736141, "grad_norm": 0.38739240169525146, "kl": 0.07965131849050522, "learning_rate": 4.8323785835711454e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2205 }, { "completion_length": 458.25, "epoch": 0.6114190687361419, "grad_norm": 0.0, "kl": 0.06780904531478882, "learning_rate": 4.832220941840386e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2206 }, { "completion_length": 365.25, "epoch": 0.6116962305986696, "grad_norm": 0.0, "kl": 0.08656688034534454, "learning_rate": 4.832063228590112e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2207 }, { "completion_length": 410.0, "epoch": 0.6119733924611973, "grad_norm": 0.37751302123069763, "kl": 0.09538718312978745, "learning_rate": 4.83190544382516e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2208 }, { "completion_length": 351.75, "epoch": 0.6122505543237251, "grad_norm": 0.4750109314918518, "kl": 0.09394862502813339, "learning_rate": 4.831747587550368e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2209 }, { "completion_length": 439.0, "epoch": 0.6125277161862528, "grad_norm": 0.3715995252132416, "kl": 0.07161372900009155, "learning_rate": 4.831589659770577e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2210 }, { "completion_length": 379.25, "epoch": 0.6128048780487805, "grad_norm": 0.0, "kl": 0.08920276910066605, "learning_rate": 4.831431660490631e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2211 }, { "completion_length": 411.75, "epoch": 0.6130820399113082, "grad_norm": 0.5210185050964355, "kl": 0.08063196390867233, "learning_rate": 4.831273589715374e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2212 }, { "completion_length": 365.0, "epoch": 0.613359201773836, "grad_norm": 0.5281893014907837, "kl": 1.0400309562683105, "learning_rate": 4.831115447449654e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2213 }, { "completion_length": 454.5, "epoch": 0.6136363636363636, "grad_norm": 0.41053399443626404, "kl": 0.06381046772003174, "learning_rate": 4.830957233698321e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2214 }, { "completion_length": 404.0, "epoch": 0.6139135254988913, "grad_norm": 0.4724915027618408, "kl": 0.08380512148141861, "learning_rate": 4.830798948466226e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2215 }, { "completion_length": 373.5, "epoch": 0.614190687361419, "grad_norm": 0.0, "kl": 0.09528902173042297, "learning_rate": 4.830640591758223e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2216 }, { "completion_length": 448.75, "epoch": 0.6144678492239468, "grad_norm": 0.3848598599433899, "kl": 0.0972680076956749, "learning_rate": 4.830482163579168e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2217 }, { "completion_length": 318.75, "epoch": 0.6147450110864745, "grad_norm": 0.47208428382873535, "kl": 0.13001632690429688, "learning_rate": 4.83032366393392e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2218 }, { "completion_length": 390.5, "epoch": 0.6150221729490022, "grad_norm": 0.7620750665664673, "kl": 0.0882563591003418, "learning_rate": 4.830165092827339e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2219 }, { "completion_length": 352.75, "epoch": 0.61529933481153, "grad_norm": 0.3915790319442749, "kl": 0.10822771489620209, "learning_rate": 4.830006450264288e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2220 }, { "completion_length": 380.0, "epoch": 0.6155764966740577, "grad_norm": 0.4173407554626465, "kl": 0.09650252759456635, "learning_rate": 4.829847736249631e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2221 }, { "completion_length": 419.0, "epoch": 0.6158536585365854, "grad_norm": 0.0, "kl": 0.09023994207382202, "learning_rate": 4.829688950788237e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2222 }, { "completion_length": 404.75, "epoch": 0.616130820399113, "grad_norm": 0.3861555755138397, "kl": 0.15688373148441315, "learning_rate": 4.829530093884973e-06, "loss": -0.0, "reward": 3.5, "reward_std": 2.0615527629852295, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2223 }, { "completion_length": 384.0, "epoch": 0.6164079822616408, "grad_norm": 0.45814505219459534, "kl": 0.10039325058460236, "learning_rate": 4.829371165544713e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2224 }, { "completion_length": 400.75, "epoch": 0.6166851441241685, "grad_norm": 0.4139263331890106, "kl": 0.10618501156568527, "learning_rate": 4.829212165772328e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2225 }, { "completion_length": 411.5, "epoch": 0.6169623059866962, "grad_norm": 0.5085843801498413, "kl": 0.09948696196079254, "learning_rate": 4.8290530945726954e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2226 }, { "completion_length": 378.25, "epoch": 0.6172394678492239, "grad_norm": 0.0, "kl": 0.08269242197275162, "learning_rate": 4.828893951950693e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2227 }, { "completion_length": 380.0, "epoch": 0.6175166297117517, "grad_norm": 0.5112580060958862, "kl": 0.06997829675674438, "learning_rate": 4.828734737911202e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2228 }, { "completion_length": 418.25, "epoch": 0.6177937915742794, "grad_norm": 0.4236229658126831, "kl": 0.08692163228988647, "learning_rate": 4.828575452459102e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2229 }, { "completion_length": 406.0, "epoch": 0.6180709534368071, "grad_norm": 0.0, "kl": 0.0761985033750534, "learning_rate": 4.828416095599281e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2230 }, { "completion_length": 385.75, "epoch": 0.6183481152993349, "grad_norm": 0.4273446798324585, "kl": 0.11622712016105652, "learning_rate": 4.828256667336624e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2231 }, { "completion_length": 459.5, "epoch": 0.6186252771618626, "grad_norm": 0.366851806640625, "kl": 0.07509475201368332, "learning_rate": 4.82809716767602e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2232 }, { "completion_length": 329.75, "epoch": 0.6189024390243902, "grad_norm": 0.0, "kl": 0.10283011198043823, "learning_rate": 4.827937596622362e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2233 }, { "completion_length": 411.0, "epoch": 0.6191796008869179, "grad_norm": 0.41898778080940247, "kl": 0.10081186890602112, "learning_rate": 4.82777795418054e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2234 }, { "completion_length": 396.75, "epoch": 0.6194567627494457, "grad_norm": 0.0, "kl": 0.07081601023674011, "learning_rate": 4.8276182403554526e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2235 }, { "completion_length": 428.75, "epoch": 0.6197339246119734, "grad_norm": 0.0, "kl": 0.08387622982263565, "learning_rate": 4.8274584551519955e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2236 }, { "completion_length": 412.5, "epoch": 0.6200110864745011, "grad_norm": 0.38036203384399414, "kl": 0.07697517424821854, "learning_rate": 4.827298598575071e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2237 }, { "completion_length": 410.0, "epoch": 0.6202882483370288, "grad_norm": 0.0, "kl": 0.07289716601371765, "learning_rate": 4.827138670629578e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2238 }, { "completion_length": 398.75, "epoch": 0.6205654101995566, "grad_norm": 0.4866851270198822, "kl": 0.07308990508317947, "learning_rate": 4.826978671320424e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2239 }, { "completion_length": 379.25, "epoch": 0.6208425720620843, "grad_norm": 0.3949616551399231, "kl": 0.08068658411502838, "learning_rate": 4.826818600652513e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2240 }, { "completion_length": 376.75, "epoch": 0.621119733924612, "grad_norm": 0.46250128746032715, "kl": 0.08991623669862747, "learning_rate": 4.8266584586307555e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2241 }, { "completion_length": 407.75, "epoch": 0.6213968957871396, "grad_norm": 0.3994554877281189, "kl": 0.0849810466170311, "learning_rate": 4.826498245260062e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2242 }, { "completion_length": 367.0, "epoch": 0.6216740576496674, "grad_norm": 0.41773080825805664, "kl": 0.09778239578008652, "learning_rate": 4.8263379605453456e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2243 }, { "completion_length": 441.0, "epoch": 0.6219512195121951, "grad_norm": 0.4226345717906952, "kl": 0.07974983006715775, "learning_rate": 4.82617760449152e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2244 }, { "completion_length": 371.0, "epoch": 0.6222283813747228, "grad_norm": 0.43791013956069946, "kl": 0.10403221100568771, "learning_rate": 4.826017177103505e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2245 }, { "completion_length": 395.0, "epoch": 0.6225055432372506, "grad_norm": 0.0, "kl": 0.08574887365102768, "learning_rate": 4.825856678386218e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2246 }, { "completion_length": 452.5, "epoch": 0.6227827050997783, "grad_norm": 0.0, "kl": 0.07390382140874863, "learning_rate": 4.825696108344583e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2247 }, { "completion_length": 368.5, "epoch": 0.623059866962306, "grad_norm": 0.0, "kl": 0.08090793341398239, "learning_rate": 4.825535466983523e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2248 }, { "completion_length": 442.5, "epoch": 0.6233370288248337, "grad_norm": 0.41859161853790283, "kl": 0.06871917098760605, "learning_rate": 4.825374754307964e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2249 }, { "completion_length": 371.5, "epoch": 0.6236141906873615, "grad_norm": 0.0, "kl": 0.08679848164319992, "learning_rate": 4.825213970322835e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2250 }, { "completion_length": 439.75, "epoch": 0.6238913525498891, "grad_norm": 0.0, "kl": 0.09451106935739517, "learning_rate": 4.825053115033066e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2251 }, { "completion_length": 399.75, "epoch": 0.6241685144124168, "grad_norm": 0.0, "kl": 0.33000972867012024, "learning_rate": 4.82489218844359e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2252 }, { "completion_length": 416.25, "epoch": 0.6244456762749445, "grad_norm": 0.5199065804481506, "kl": 0.16540122032165527, "learning_rate": 4.824731190559341e-06, "loss": 0.0, "reward": 4.875, "reward_std": 1.75, "rewards/confident_score_func": 1.625, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2253 }, { "completion_length": 349.5, "epoch": 0.6247228381374723, "grad_norm": 0.5588317513465881, "kl": 0.10006768256425858, "learning_rate": 4.824570121385258e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2254 }, { "completion_length": 376.5, "epoch": 0.625, "grad_norm": 0.0, "kl": 0.12251950800418854, "learning_rate": 4.824408980926279e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2255 }, { "completion_length": 490.75, "epoch": 0.6252771618625277, "grad_norm": 0.34730035066604614, "kl": 0.08098164945840836, "learning_rate": 4.824247769187345e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2256 }, { "completion_length": 364.75, "epoch": 0.6255543237250555, "grad_norm": 0.0, "kl": 0.07166342437267303, "learning_rate": 4.824086486173402e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2257 }, { "completion_length": 400.75, "epoch": 0.6258314855875832, "grad_norm": 0.0, "kl": 0.06943422555923462, "learning_rate": 4.823925131889393e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2258 }, { "completion_length": 400.25, "epoch": 0.6261086474501109, "grad_norm": 0.44783779978752136, "kl": 0.08362576365470886, "learning_rate": 4.823763706340269e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2259 }, { "completion_length": 433.75, "epoch": 0.6263858093126385, "grad_norm": 0.0, "kl": 0.08100445568561554, "learning_rate": 4.823602209530978e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2260 }, { "completion_length": 374.0, "epoch": 0.6266629711751663, "grad_norm": 0.498562753200531, "kl": 0.1144787073135376, "learning_rate": 4.8234406414664735e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2261 }, { "completion_length": 423.75, "epoch": 0.626940133037694, "grad_norm": 0.3756246268749237, "kl": 0.0825212150812149, "learning_rate": 4.8232790021517094e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2262 }, { "completion_length": 407.0, "epoch": 0.6272172949002217, "grad_norm": 0.0, "kl": 0.10064508020877838, "learning_rate": 4.823117291591644e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2263 }, { "completion_length": 359.0, "epoch": 0.6274944567627494, "grad_norm": 0.0, "kl": 0.08824969828128815, "learning_rate": 4.8229555097912335e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2264 }, { "completion_length": 408.75, "epoch": 0.6277716186252772, "grad_norm": 0.0, "kl": 0.0845467746257782, "learning_rate": 4.822793656755441e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2265 }, { "completion_length": 378.5, "epoch": 0.6280487804878049, "grad_norm": 0.0, "kl": 0.09376949816942215, "learning_rate": 4.822631732489231e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2266 }, { "completion_length": 383.0, "epoch": 0.6283259423503326, "grad_norm": 0.4835580587387085, "kl": 0.08930260688066483, "learning_rate": 4.822469736997566e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2267 }, { "completion_length": 369.75, "epoch": 0.6286031042128604, "grad_norm": 0.4476292133331299, "kl": 0.18442478775978088, "learning_rate": 4.822307670285417e-06, "loss": -0.0, "reward": 1.875, "reward_std": 0.25, "rewards/confident_score_func": 0.125, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2268 }, { "completion_length": 361.25, "epoch": 0.628880266075388, "grad_norm": 0.48595747351646423, "kl": 0.09233783930540085, "learning_rate": 4.8221455323577515e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2269 }, { "completion_length": 416.75, "epoch": 0.6291574279379157, "grad_norm": 0.4633837044239044, "kl": 0.09441784769296646, "learning_rate": 4.821983323219542e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2270 }, { "completion_length": 397.5, "epoch": 0.6294345898004434, "grad_norm": 0.49419745802879333, "kl": 0.07152225822210312, "learning_rate": 4.8218210428757635e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2271 }, { "completion_length": 455.5, "epoch": 0.6297117516629712, "grad_norm": 0.0, "kl": 0.08504863828420639, "learning_rate": 4.8216586913313924e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2272 }, { "completion_length": 459.0, "epoch": 0.6299889135254989, "grad_norm": 0.3796127736568451, "kl": 0.10081274062395096, "learning_rate": 4.821496268591408e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2273 }, { "completion_length": 365.25, "epoch": 0.6302660753880266, "grad_norm": 0.0, "kl": 0.08249808102846146, "learning_rate": 4.821333774660789e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2274 }, { "completion_length": 439.25, "epoch": 0.6305432372505543, "grad_norm": 1.078192114830017, "kl": 0.3360956609249115, "learning_rate": 4.82117120954452e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2275 }, { "completion_length": 404.75, "epoch": 0.6308203991130821, "grad_norm": 0.4133490324020386, "kl": 0.3143353760242462, "learning_rate": 4.821008573247585e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2276 }, { "completion_length": 441.75, "epoch": 0.6310975609756098, "grad_norm": 0.3332699239253998, "kl": 0.08198325335979462, "learning_rate": 4.820845865774973e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2277 }, { "completion_length": 439.25, "epoch": 0.6313747228381374, "grad_norm": 0.4019729197025299, "kl": 0.07667096704244614, "learning_rate": 4.8206830871316726e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2278 }, { "completion_length": 422.0, "epoch": 0.6316518847006651, "grad_norm": 0.5316561460494995, "kl": 0.11716204881668091, "learning_rate": 4.820520237322676e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2279 }, { "completion_length": 456.75, "epoch": 0.6319290465631929, "grad_norm": 0.3912135064601898, "kl": 0.07701128721237183, "learning_rate": 4.820357316352977e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2280 }, { "completion_length": 444.75, "epoch": 0.6322062084257206, "grad_norm": 0.0, "kl": 0.08395484834909439, "learning_rate": 4.820194324227571e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2281 }, { "completion_length": 430.25, "epoch": 0.6324833702882483, "grad_norm": 0.0, "kl": 0.0885709747672081, "learning_rate": 4.820031260951457e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2282 }, { "completion_length": 388.5, "epoch": 0.6327605321507761, "grad_norm": 0.47915369272232056, "kl": 0.10636480152606964, "learning_rate": 4.819868126529635e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2283 }, { "completion_length": 382.25, "epoch": 0.6330376940133038, "grad_norm": 0.0, "kl": 0.09759742766618729, "learning_rate": 4.819704920967109e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2284 }, { "completion_length": 357.5, "epoch": 0.6333148558758315, "grad_norm": 0.0, "kl": 0.08899439871311188, "learning_rate": 4.819541644268881e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2285 }, { "completion_length": 514.0, "epoch": 0.6335920177383592, "grad_norm": 0.3837408721446991, "kl": 0.09060537070035934, "learning_rate": 4.819378296439962e-06, "loss": -0.0, "reward": 3.875, "reward_std": 2.1746647357940674, "rewards/confident_score_func": 1.125, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2286 }, { "completion_length": 416.5, "epoch": 0.633869179600887, "grad_norm": 0.3766602873802185, "kl": 0.07730740308761597, "learning_rate": 4.8192148774853575e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2287 }, { "completion_length": 384.0, "epoch": 0.6341463414634146, "grad_norm": 0.0, "kl": 0.11158870160579681, "learning_rate": 4.819051387410081e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2288 }, { "completion_length": 343.75, "epoch": 0.6344235033259423, "grad_norm": 0.0, "kl": 0.11100009083747864, "learning_rate": 4.8188878262191455e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2289 }, { "completion_length": 442.0, "epoch": 0.63470066518847, "grad_norm": 0.3916703462600708, "kl": 0.07841671258211136, "learning_rate": 4.818724193917566e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2290 }, { "completion_length": 405.0, "epoch": 0.6349778270509978, "grad_norm": 0.0, "kl": 0.10568458586931229, "learning_rate": 4.818560490510362e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2291 }, { "completion_length": 467.75, "epoch": 0.6352549889135255, "grad_norm": 0.41236257553100586, "kl": 0.1867004930973053, "learning_rate": 4.818396716002553e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2292 }, { "completion_length": 312.25, "epoch": 0.6355321507760532, "grad_norm": 0.0, "kl": 0.08837425708770752, "learning_rate": 4.81823287039916e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2293 }, { "completion_length": 428.0, "epoch": 0.635809312638581, "grad_norm": 0.4522424638271332, "kl": 0.0919090062379837, "learning_rate": 4.818068953705209e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2294 }, { "completion_length": 407.75, "epoch": 0.6360864745011087, "grad_norm": 0.0, "kl": 0.10579009354114532, "learning_rate": 4.817904965925726e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2295 }, { "completion_length": 466.25, "epoch": 0.6363636363636364, "grad_norm": 0.36898210644721985, "kl": 0.08561345189809799, "learning_rate": 4.81774090706574e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2296 }, { "completion_length": 400.5, "epoch": 0.636640798226164, "grad_norm": 0.4122694432735443, "kl": 0.09360263496637344, "learning_rate": 4.8175767771302815e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2297 }, { "completion_length": 387.0, "epoch": 0.6369179600886918, "grad_norm": 0.41535332798957825, "kl": 0.09096702933311462, "learning_rate": 4.817412576124385e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2298 }, { "completion_length": 402.5, "epoch": 0.6371951219512195, "grad_norm": 0.3683679699897766, "kl": 0.09098543226718903, "learning_rate": 4.8172483040530845e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2299 }, { "completion_length": 437.75, "epoch": 0.6374722838137472, "grad_norm": 0.0, "kl": 0.08842942863702774, "learning_rate": 4.817083960921418e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2300 }, { "completion_length": 360.75, "epoch": 0.6377494456762749, "grad_norm": 0.5467607378959656, "kl": 0.08507934957742691, "learning_rate": 4.816919546734426e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2301 }, { "completion_length": 472.0, "epoch": 0.6380266075388027, "grad_norm": 0.7474622130393982, "kl": 0.08448350429534912, "learning_rate": 4.816755061497148e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2302 }, { "completion_length": 423.25, "epoch": 0.6383037694013304, "grad_norm": 0.41558703780174255, "kl": 0.07487901300191879, "learning_rate": 4.816590505214631e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2303 }, { "completion_length": 386.0, "epoch": 0.6385809312638581, "grad_norm": 0.0, "kl": 0.11242664605379105, "learning_rate": 4.81642587789192e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2304 }, { "completion_length": 365.5, "epoch": 0.6388580931263859, "grad_norm": 0.0, "kl": 0.09746406227350235, "learning_rate": 4.816261179534063e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2305 }, { "completion_length": 368.25, "epoch": 0.6391352549889135, "grad_norm": 0.0, "kl": 0.07945965975522995, "learning_rate": 4.8160964101461105e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2306 }, { "completion_length": 395.0, "epoch": 0.6394124168514412, "grad_norm": 0.4989100694656372, "kl": 0.07095380872488022, "learning_rate": 4.815931569733117e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2307 }, { "completion_length": 454.0, "epoch": 0.6396895787139689, "grad_norm": 0.41376444697380066, "kl": 0.09681357443332672, "learning_rate": 4.815766658300135e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2308 }, { "completion_length": 415.25, "epoch": 0.6399667405764967, "grad_norm": 0.45036226511001587, "kl": 0.0808383971452713, "learning_rate": 4.815601675852224e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2309 }, { "completion_length": 447.5, "epoch": 0.6402439024390244, "grad_norm": 0.4382658004760742, "kl": 0.10624098777770996, "learning_rate": 4.815436622394442e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2310 }, { "completion_length": 469.0, "epoch": 0.6405210643015521, "grad_norm": 0.39163437485694885, "kl": 0.09677761048078537, "learning_rate": 4.81527149793185e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2311 }, { "completion_length": 490.0, "epoch": 0.6407982261640798, "grad_norm": 0.0, "kl": 0.07912631332874298, "learning_rate": 4.815106302469513e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2312 }, { "completion_length": 376.75, "epoch": 0.6410753880266076, "grad_norm": 0.0, "kl": 0.10518958419561386, "learning_rate": 4.814941036012497e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2313 }, { "completion_length": 431.75, "epoch": 0.6413525498891353, "grad_norm": 0.0, "kl": 0.08211927860975266, "learning_rate": 4.814775698565869e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2314 }, { "completion_length": 350.0, "epoch": 0.641629711751663, "grad_norm": 0.4612126052379608, "kl": 0.09738625586032867, "learning_rate": 4.814610290134699e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2315 }, { "completion_length": 418.5, "epoch": 0.6419068736141907, "grad_norm": 0.0, "kl": 0.10191328823566437, "learning_rate": 4.8144448107240604e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2316 }, { "completion_length": 381.0, "epoch": 0.6421840354767184, "grad_norm": 0.4526805877685547, "kl": 0.1061779260635376, "learning_rate": 4.814279260339027e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2317 }, { "completion_length": 398.0, "epoch": 0.6424611973392461, "grad_norm": 0.4698142409324646, "kl": 0.08615991473197937, "learning_rate": 4.814113638984676e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2318 }, { "completion_length": 368.75, "epoch": 0.6427383592017738, "grad_norm": 0.0, "kl": 0.11694981902837753, "learning_rate": 4.813947946666086e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2319 }, { "completion_length": 416.0, "epoch": 0.6430155210643016, "grad_norm": 0.0, "kl": 0.27133768796920776, "learning_rate": 4.813782183388339e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2320 }, { "completion_length": 412.5, "epoch": 0.6432926829268293, "grad_norm": 0.0, "kl": 0.10642021149396896, "learning_rate": 4.813616349156517e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2321 }, { "completion_length": 435.5, "epoch": 0.643569844789357, "grad_norm": 0.42984530329704285, "kl": 0.0976809561252594, "learning_rate": 4.813450443975705e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2322 }, { "completion_length": 380.75, "epoch": 0.6438470066518847, "grad_norm": 0.0, "kl": 0.10978364944458008, "learning_rate": 4.8132844678509935e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2323 }, { "completion_length": 456.0, "epoch": 0.6441241685144125, "grad_norm": 0.0, "kl": 0.15322959423065186, "learning_rate": 4.81311842078747e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2324 }, { "completion_length": 429.75, "epoch": 0.6444013303769401, "grad_norm": 0.4156922399997711, "kl": 0.1127401664853096, "learning_rate": 4.812952302790226e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2325 }, { "completion_length": 467.75, "epoch": 0.6446784922394678, "grad_norm": 0.36109891533851624, "kl": 0.0874846801161766, "learning_rate": 4.812786113864356e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2326 }, { "completion_length": 372.0, "epoch": 0.6449556541019955, "grad_norm": 0.0, "kl": 0.09305670857429504, "learning_rate": 4.812619854014958e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2327 }, { "completion_length": 406.25, "epoch": 0.6452328159645233, "grad_norm": 0.38110968470573425, "kl": 0.10080117732286453, "learning_rate": 4.81245352324713e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2328 }, { "completion_length": 431.5, "epoch": 0.645509977827051, "grad_norm": 0.402607798576355, "kl": 0.08407770842313766, "learning_rate": 4.81228712156597e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2329 }, { "completion_length": 391.25, "epoch": 0.6457871396895787, "grad_norm": 0.4416881799697876, "kl": 0.07775174826383591, "learning_rate": 4.812120648976584e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2330 }, { "completion_length": 438.25, "epoch": 0.6460643015521065, "grad_norm": 0.3745209872722626, "kl": 0.07690268009901047, "learning_rate": 4.811954105484076e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2331 }, { "completion_length": 443.25, "epoch": 0.6463414634146342, "grad_norm": 0.3757140338420868, "kl": 0.07575144618749619, "learning_rate": 4.811787491093552e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2332 }, { "completion_length": 409.5, "epoch": 0.6466186252771619, "grad_norm": 0.4891943335533142, "kl": 0.09373366087675095, "learning_rate": 4.811620805810124e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2333 }, { "completion_length": 402.25, "epoch": 0.6468957871396895, "grad_norm": 0.44194287061691284, "kl": 0.09571236371994019, "learning_rate": 4.811454049638901e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2334 }, { "completion_length": 356.5, "epoch": 0.6471729490022173, "grad_norm": 0.0, "kl": 0.1068028137087822, "learning_rate": 4.811287222584999e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2335 }, { "completion_length": 405.75, "epoch": 0.647450110864745, "grad_norm": 0.0, "kl": 0.775745689868927, "learning_rate": 4.811120324653531e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2336 }, { "completion_length": 445.5, "epoch": 0.6477272727272727, "grad_norm": 0.44071900844573975, "kl": 0.07857426255941391, "learning_rate": 4.810953355849617e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2337 }, { "completion_length": 450.75, "epoch": 0.6480044345898004, "grad_norm": 0.47518685460090637, "kl": 0.10473766177892685, "learning_rate": 4.810786316178377e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2338 }, { "completion_length": 492.75, "epoch": 0.6482815964523282, "grad_norm": 0.4268428683280945, "kl": 0.08847658336162567, "learning_rate": 4.810619205644934e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2339 }, { "completion_length": 438.5, "epoch": 0.6485587583148559, "grad_norm": 0.4754917621612549, "kl": 0.09347332268953323, "learning_rate": 4.810452024254411e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2340 }, { "completion_length": 490.25, "epoch": 0.6488359201773836, "grad_norm": 0.0, "kl": 0.09721893072128296, "learning_rate": 4.810284772011936e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2341 }, { "completion_length": 478.25, "epoch": 0.6491130820399114, "grad_norm": 0.0, "kl": 0.11267387866973877, "learning_rate": 4.810117448922638e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2342 }, { "completion_length": 425.25, "epoch": 0.649390243902439, "grad_norm": 0.39837104082107544, "kl": 0.09711050242185593, "learning_rate": 4.809950054991647e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2343 }, { "completion_length": 378.75, "epoch": 0.6496674057649667, "grad_norm": 0.0, "kl": 0.09421608597040176, "learning_rate": 4.809782590224097e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2344 }, { "completion_length": 404.5, "epoch": 0.6499445676274944, "grad_norm": 0.0, "kl": 0.08434249460697174, "learning_rate": 4.809615054625124e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2345 }, { "completion_length": 492.75, "epoch": 0.6502217294900222, "grad_norm": 0.4177391529083252, "kl": 0.17010511457920074, "learning_rate": 4.8094474481998644e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2346 }, { "completion_length": 385.0, "epoch": 0.6504988913525499, "grad_norm": 0.4022493362426758, "kl": 0.10493117570877075, "learning_rate": 4.809279770953459e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2347 }, { "completion_length": 411.0, "epoch": 0.6507760532150776, "grad_norm": 0.0, "kl": 0.12038018554449081, "learning_rate": 4.809112022891049e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2348 }, { "completion_length": 416.5, "epoch": 0.6510532150776053, "grad_norm": 0.42504188418388367, "kl": 0.07898851484060287, "learning_rate": 4.808944204017779e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2349 }, { "completion_length": 411.25, "epoch": 0.6513303769401331, "grad_norm": 0.5305551886558533, "kl": 0.10675065219402313, "learning_rate": 4.808776314338796e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2350 }, { "completion_length": 430.25, "epoch": 0.6516075388026608, "grad_norm": 0.47301316261291504, "kl": 0.07864373922348022, "learning_rate": 4.808608353859247e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2351 }, { "completion_length": 477.5, "epoch": 0.6518847006651884, "grad_norm": 0.3623493015766144, "kl": 0.09067756682634354, "learning_rate": 4.808440322584283e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2352 }, { "completion_length": 463.0, "epoch": 0.6521618625277162, "grad_norm": 0.4479110836982727, "kl": 0.11523682624101639, "learning_rate": 4.808272220519058e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2353 }, { "completion_length": 453.75, "epoch": 0.6524390243902439, "grad_norm": 0.39072489738464355, "kl": 0.1115868017077446, "learning_rate": 4.808104047668727e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2354 }, { "completion_length": 428.0, "epoch": 0.6527161862527716, "grad_norm": 0.3951628804206848, "kl": 0.09062030911445618, "learning_rate": 4.807935804038445e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2355 }, { "completion_length": 400.0, "epoch": 0.6529933481152993, "grad_norm": 0.45963773131370544, "kl": 0.09972993284463882, "learning_rate": 4.807767489633372e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2356 }, { "completion_length": 412.5, "epoch": 0.6532705099778271, "grad_norm": 0.0, "kl": 0.09718511253595352, "learning_rate": 4.807599104458671e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2357 }, { "completion_length": 397.25, "epoch": 0.6535476718403548, "grad_norm": 0.5148431658744812, "kl": 0.17115157842636108, "learning_rate": 4.807430648519506e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2358 }, { "completion_length": 427.75, "epoch": 0.6538248337028825, "grad_norm": 0.0, "kl": 0.09187474846839905, "learning_rate": 4.80726212182104e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2359 }, { "completion_length": 505.0, "epoch": 0.6541019955654102, "grad_norm": 0.4037743806838989, "kl": 0.06682705134153366, "learning_rate": 4.807093524368443e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2360 }, { "completion_length": 378.5, "epoch": 0.654379157427938, "grad_norm": 0.45108672976493835, "kl": 0.09533297270536423, "learning_rate": 4.806924856166886e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2361 }, { "completion_length": 420.25, "epoch": 0.6546563192904656, "grad_norm": 0.0, "kl": 0.12595757842063904, "learning_rate": 4.806756117221539e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2362 }, { "completion_length": 345.75, "epoch": 0.6549334811529933, "grad_norm": 0.0, "kl": 0.12025823444128036, "learning_rate": 4.806587307537578e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2363 }, { "completion_length": 427.75, "epoch": 0.655210643015521, "grad_norm": 0.46081089973449707, "kl": 0.09373418241739273, "learning_rate": 4.80641842712018e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2364 }, { "completion_length": 438.25, "epoch": 0.6554878048780488, "grad_norm": 0.4610963463783264, "kl": 0.09367596358060837, "learning_rate": 4.806249475974522e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2365 }, { "completion_length": 382.25, "epoch": 0.6557649667405765, "grad_norm": 0.6345430016517639, "kl": 0.09717914462089539, "learning_rate": 4.8060804541057886e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2366 }, { "completion_length": 406.5, "epoch": 0.6560421286031042, "grad_norm": 0.4515922963619232, "kl": 0.10909504443407059, "learning_rate": 4.805911361519159e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2367 }, { "completion_length": 418.5, "epoch": 0.656319290465632, "grad_norm": 0.0, "kl": 0.09662417322397232, "learning_rate": 4.805742198219821e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2368 }, { "completion_length": 461.5, "epoch": 0.6565964523281597, "grad_norm": 0.0, "kl": 0.10151102393865585, "learning_rate": 4.80557296421296e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2369 }, { "completion_length": 418.5, "epoch": 0.6568736141906873, "grad_norm": 0.4189237356185913, "kl": 0.10286587476730347, "learning_rate": 4.805403659503769e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.625, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2370 }, { "completion_length": 380.25, "epoch": 0.657150776053215, "grad_norm": 0.0, "kl": 0.12101428955793381, "learning_rate": 4.805234284097437e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2371 }, { "completion_length": 428.25, "epoch": 0.6574279379157428, "grad_norm": 0.0, "kl": 0.09825007617473602, "learning_rate": 4.805064837999159e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2372 }, { "completion_length": 398.0, "epoch": 0.6577050997782705, "grad_norm": 0.0, "kl": 0.1444358229637146, "learning_rate": 4.8048953212141314e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2373 }, { "completion_length": 391.5, "epoch": 0.6579822616407982, "grad_norm": 0.0, "kl": 0.19808612763881683, "learning_rate": 4.804725733747553e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2374 }, { "completion_length": 395.5, "epoch": 0.6582594235033259, "grad_norm": 0.0, "kl": 0.09704709053039551, "learning_rate": 4.804556075604623e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2375 }, { "completion_length": 455.0, "epoch": 0.6585365853658537, "grad_norm": 0.41332313418388367, "kl": 0.1679326444864273, "learning_rate": 4.804386346790545e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2376 }, { "completion_length": 350.75, "epoch": 0.6588137472283814, "grad_norm": 0.0, "kl": 0.1018766239285469, "learning_rate": 4.804216547310524e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2377 }, { "completion_length": 387.25, "epoch": 0.6590909090909091, "grad_norm": 0.0, "kl": 0.11364524066448212, "learning_rate": 4.804046677169767e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2378 }, { "completion_length": 423.5, "epoch": 0.6593680709534369, "grad_norm": 0.0, "kl": 0.10334087908267975, "learning_rate": 4.803876736373483e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2379 }, { "completion_length": 352.25, "epoch": 0.6596452328159645, "grad_norm": 0.0, "kl": 0.12973323464393616, "learning_rate": 4.803706724926883e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2380 }, { "completion_length": 494.75, "epoch": 0.6599223946784922, "grad_norm": 0.0, "kl": 0.10601973533630371, "learning_rate": 4.803536642835181e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2381 }, { "completion_length": 461.0, "epoch": 0.6601995565410199, "grad_norm": 0.0, "kl": 0.08605995029211044, "learning_rate": 4.803366490103593e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2382 }, { "completion_length": 326.0, "epoch": 0.6604767184035477, "grad_norm": 0.5412529110908508, "kl": 0.14985620975494385, "learning_rate": 4.803196266737335e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2383 }, { "completion_length": 439.75, "epoch": 0.6607538802660754, "grad_norm": 0.47176656126976013, "kl": 0.13294248282909393, "learning_rate": 4.803025972741631e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2384 }, { "completion_length": 421.75, "epoch": 0.6610310421286031, "grad_norm": 0.0, "kl": 0.10935384780168533, "learning_rate": 4.802855608121699e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2385 }, { "completion_length": 371.5, "epoch": 0.6613082039911308, "grad_norm": 0.0, "kl": 0.09839347004890442, "learning_rate": 4.8026851728827665e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2386 }, { "completion_length": 395.0, "epoch": 0.6615853658536586, "grad_norm": 0.0, "kl": 0.10190580785274506, "learning_rate": 4.802514667030057e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2387 }, { "completion_length": 402.25, "epoch": 0.6618625277161863, "grad_norm": 0.5494387745857239, "kl": 0.08967026323080063, "learning_rate": 4.8023440905688015e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2388 }, { "completion_length": 461.75, "epoch": 0.6621396895787139, "grad_norm": 0.3895524740219116, "kl": 0.11557690054178238, "learning_rate": 4.802173443504231e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2389 }, { "completion_length": 459.0, "epoch": 0.6624168514412417, "grad_norm": 0.0, "kl": 0.11338522285223007, "learning_rate": 4.802002725841577e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2390 }, { "completion_length": 440.75, "epoch": 0.6626940133037694, "grad_norm": 0.0, "kl": 0.10880238562822342, "learning_rate": 4.801831937586075e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2391 }, { "completion_length": 397.75, "epoch": 0.6629711751662971, "grad_norm": 0.45621800422668457, "kl": 0.14823225140571594, "learning_rate": 4.8016610787429645e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2392 }, { "completion_length": 480.0, "epoch": 0.6632483370288248, "grad_norm": 0.40225356817245483, "kl": 0.07857147604227066, "learning_rate": 4.801490149317483e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2393 }, { "completion_length": 431.75, "epoch": 0.6635254988913526, "grad_norm": 0.0, "kl": 0.0958699956536293, "learning_rate": 4.801319149314872e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2394 }, { "completion_length": 414.75, "epoch": 0.6638026607538803, "grad_norm": 0.4276851415634155, "kl": 0.10853245109319687, "learning_rate": 4.801148078740376e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2395 }, { "completion_length": 389.25, "epoch": 0.664079822616408, "grad_norm": 0.5199501514434814, "kl": 0.6338328123092651, "learning_rate": 4.800976937599241e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2396 }, { "completion_length": 414.5, "epoch": 0.6643569844789357, "grad_norm": 0.0, "kl": 0.11970917880535126, "learning_rate": 4.800805725896715e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2397 }, { "completion_length": 383.5, "epoch": 0.6646341463414634, "grad_norm": 0.0, "kl": 0.12212855368852615, "learning_rate": 4.800634443638049e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2398 }, { "completion_length": 389.25, "epoch": 0.6649113082039911, "grad_norm": 0.4121502637863159, "kl": 0.1277087926864624, "learning_rate": 4.8004630908284945e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2399 }, { "completion_length": 468.25, "epoch": 0.6651884700665188, "grad_norm": 0.3973202705383301, "kl": 0.09373615682125092, "learning_rate": 4.800291667473307e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2400 }, { "completion_length": 440.5, "epoch": 0.6654656319290465, "grad_norm": 0.0, "kl": 0.09203606098890305, "learning_rate": 4.800120173577743e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2401 }, { "completion_length": 380.25, "epoch": 0.6657427937915743, "grad_norm": 0.0, "kl": 0.11012143641710281, "learning_rate": 4.799948609147061e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2402 }, { "completion_length": 451.75, "epoch": 0.666019955654102, "grad_norm": 0.4109841585159302, "kl": 0.09584180265665054, "learning_rate": 4.799776974186523e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2403 }, { "completion_length": 442.0, "epoch": 0.6662971175166297, "grad_norm": 0.41174548864364624, "kl": 0.13307823240756989, "learning_rate": 4.799605268701392e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2404 }, { "completion_length": 519.25, "epoch": 0.6665742793791575, "grad_norm": 0.37482014298439026, "kl": 0.09806432574987411, "learning_rate": 4.799433492696935e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2405 }, { "completion_length": 421.75, "epoch": 0.6668514412416852, "grad_norm": 0.46838656067848206, "kl": 0.08799239248037338, "learning_rate": 4.7992616461784156e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2406 }, { "completion_length": 400.75, "epoch": 0.6671286031042128, "grad_norm": 0.45827817916870117, "kl": 0.10021618008613586, "learning_rate": 4.799089729151107e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2407 }, { "completion_length": 372.0, "epoch": 0.6674057649667405, "grad_norm": 0.4265654683113098, "kl": 0.08227167278528214, "learning_rate": 4.798917741620281e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2408 }, { "completion_length": 387.5, "epoch": 0.6676829268292683, "grad_norm": 0.3889097273349762, "kl": 0.09586618840694427, "learning_rate": 4.798745683591211e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2409 }, { "completion_length": 392.75, "epoch": 0.667960088691796, "grad_norm": 0.45383039116859436, "kl": 0.11230072379112244, "learning_rate": 4.798573555069174e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2410 }, { "completion_length": 472.0, "epoch": 0.6682372505543237, "grad_norm": 0.3549756705760956, "kl": 0.10919195413589478, "learning_rate": 4.798401356059448e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2411 }, { "completion_length": 371.75, "epoch": 0.6685144124168514, "grad_norm": 0.0, "kl": 0.10336726158857346, "learning_rate": 4.798229086567312e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2412 }, { "completion_length": 429.0, "epoch": 0.6687915742793792, "grad_norm": 0.0, "kl": 0.08076634258031845, "learning_rate": 4.798056746598051e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2413 }, { "completion_length": 391.5, "epoch": 0.6690687361419069, "grad_norm": 0.0, "kl": 0.08997830748558044, "learning_rate": 4.79788433615695e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2414 }, { "completion_length": 361.0, "epoch": 0.6693458980044346, "grad_norm": 0.4650171101093292, "kl": 0.10559430718421936, "learning_rate": 4.797711855249295e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2415 }, { "completion_length": 388.25, "epoch": 0.6696230598669624, "grad_norm": 0.0, "kl": 0.3543970584869385, "learning_rate": 4.797539303880375e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2416 }, { "completion_length": 420.0, "epoch": 0.66990022172949, "grad_norm": 0.45601144433021545, "kl": 0.11405503004789352, "learning_rate": 4.797366682055482e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2417 }, { "completion_length": 407.75, "epoch": 0.6701773835920177, "grad_norm": 0.4556528627872467, "kl": 0.13554054498672485, "learning_rate": 4.797193989779911e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2418 }, { "completion_length": 398.5, "epoch": 0.6704545454545454, "grad_norm": 0.0, "kl": 0.10017798840999603, "learning_rate": 4.797021227058955e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2419 }, { "completion_length": 478.25, "epoch": 0.6707317073170732, "grad_norm": 0.3443848788738251, "kl": 0.11728712916374207, "learning_rate": 4.796848393897914e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2420 }, { "completion_length": 484.0, "epoch": 0.6710088691796009, "grad_norm": 0.0, "kl": 0.08774635195732117, "learning_rate": 4.7966754903020875e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2421 }, { "completion_length": 426.5, "epoch": 0.6712860310421286, "grad_norm": 0.46195581555366516, "kl": 0.08674471825361252, "learning_rate": 4.796502516276777e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2422 }, { "completion_length": 534.0, "epoch": 0.6715631929046563, "grad_norm": 0.3663676083087921, "kl": 0.08634459972381592, "learning_rate": 4.796329471827289e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2423 }, { "completion_length": 521.75, "epoch": 0.6718403547671841, "grad_norm": 0.0, "kl": 0.1101851686835289, "learning_rate": 4.796156356958927e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2424 }, { "completion_length": 471.5, "epoch": 0.6721175166297118, "grad_norm": 0.3491523265838623, "kl": 0.07868814468383789, "learning_rate": 4.7959831716770025e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2425 }, { "completion_length": 455.0, "epoch": 0.6723946784922394, "grad_norm": 0.40517938137054443, "kl": 0.09387170523405075, "learning_rate": 4.795809915986824e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2426 }, { "completion_length": 418.0, "epoch": 0.6726718403547672, "grad_norm": 0.0, "kl": 0.09330562502145767, "learning_rate": 4.795636589893707e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2427 }, { "completion_length": 378.75, "epoch": 0.6729490022172949, "grad_norm": 0.0, "kl": 0.11366258561611176, "learning_rate": 4.795463193402965e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2428 }, { "completion_length": 457.75, "epoch": 0.6732261640798226, "grad_norm": 0.4177197217941284, "kl": 0.10641977936029434, "learning_rate": 4.795289726519915e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2429 }, { "completion_length": 493.5, "epoch": 0.6735033259423503, "grad_norm": 0.3756590485572815, "kl": 0.1074524074792862, "learning_rate": 4.7951161892498775e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2430 }, { "completion_length": 424.5, "epoch": 0.6737804878048781, "grad_norm": 0.4373193681240082, "kl": 0.12767988443374634, "learning_rate": 4.7949425815981745e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2431 }, { "completion_length": 517.25, "epoch": 0.6740576496674058, "grad_norm": 0.3630236089229584, "kl": 0.10956472158432007, "learning_rate": 4.794768903570128e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2432 }, { "completion_length": 503.25, "epoch": 0.6743348115299335, "grad_norm": 0.3622356951236725, "kl": 0.08681051433086395, "learning_rate": 4.794595155171067e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2433 }, { "completion_length": 536.75, "epoch": 0.6746119733924612, "grad_norm": 0.40500351786613464, "kl": 0.08329901099205017, "learning_rate": 4.794421336406317e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2434 }, { "completion_length": 437.5, "epoch": 0.674889135254989, "grad_norm": 0.38139787316322327, "kl": 0.10830465704202652, "learning_rate": 4.794247447281209e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2435 }, { "completion_length": 403.0, "epoch": 0.6751662971175166, "grad_norm": 0.4589168429374695, "kl": 0.11722326278686523, "learning_rate": 4.794073487801076e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2436 }, { "completion_length": 416.75, "epoch": 0.6754434589800443, "grad_norm": 0.40768131613731384, "kl": 0.10555222630500793, "learning_rate": 4.793899457971252e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2437 }, { "completion_length": 380.0, "epoch": 0.6757206208425721, "grad_norm": 0.0, "kl": 0.11059778928756714, "learning_rate": 4.793725357797074e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2438 }, { "completion_length": 375.0, "epoch": 0.6759977827050998, "grad_norm": 0.470824658870697, "kl": 0.10935216397047043, "learning_rate": 4.793551187283881e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2439 }, { "completion_length": 415.75, "epoch": 0.6762749445676275, "grad_norm": 0.4020567238330841, "kl": 0.08527261018753052, "learning_rate": 4.793376946437014e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2440 }, { "completion_length": 356.5, "epoch": 0.6765521064301552, "grad_norm": 0.4684886634349823, "kl": 0.10447166115045547, "learning_rate": 4.793202635261816e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2441 }, { "completion_length": 446.5, "epoch": 0.676829268292683, "grad_norm": 0.3756389021873474, "kl": 0.0995296835899353, "learning_rate": 4.793028253763633e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2442 }, { "completion_length": 476.75, "epoch": 0.6771064301552107, "grad_norm": 0.39695852994918823, "kl": 0.08904004842042923, "learning_rate": 4.792853801947812e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2443 }, { "completion_length": 383.25, "epoch": 0.6773835920177383, "grad_norm": 0.0, "kl": 0.09706704318523407, "learning_rate": 4.792679279819703e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2444 }, { "completion_length": 449.0, "epoch": 0.677660753880266, "grad_norm": 0.5350883603096008, "kl": 0.1274198740720749, "learning_rate": 4.792504687384658e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2445 }, { "completion_length": 433.5, "epoch": 0.6779379157427938, "grad_norm": 0.6705424785614014, "kl": 0.10765786468982697, "learning_rate": 4.7923300246480305e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2446 }, { "completion_length": 431.75, "epoch": 0.6782150776053215, "grad_norm": 0.38701361417770386, "kl": 0.11525865644216537, "learning_rate": 4.792155291615177e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2447 }, { "completion_length": 443.75, "epoch": 0.6784922394678492, "grad_norm": 0.42991116642951965, "kl": 0.09030509740114212, "learning_rate": 4.791980488291457e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2448 }, { "completion_length": 373.0, "epoch": 0.6787694013303769, "grad_norm": 0.6561987996101379, "kl": 0.1587313711643219, "learning_rate": 4.791805614682228e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2449 }, { "completion_length": 465.0, "epoch": 0.6790465631929047, "grad_norm": 0.4110703468322754, "kl": 0.12704607844352722, "learning_rate": 4.7916306707928555e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2450 }, { "completion_length": 480.25, "epoch": 0.6793237250554324, "grad_norm": 0.4071118235588074, "kl": 0.09914778172969818, "learning_rate": 4.791455656628703e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2451 }, { "completion_length": 384.75, "epoch": 0.6796008869179601, "grad_norm": 0.0, "kl": 0.11374300718307495, "learning_rate": 4.791280572195138e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2452 }, { "completion_length": 415.75, "epoch": 0.6798780487804879, "grad_norm": 0.0, "kl": 0.0992097482085228, "learning_rate": 4.791105417497529e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2453 }, { "completion_length": 481.75, "epoch": 0.6801552106430155, "grad_norm": 0.385957807302475, "kl": 0.09293770045042038, "learning_rate": 4.790930192541248e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2454 }, { "completion_length": 375.0, "epoch": 0.6804323725055432, "grad_norm": 0.4513918161392212, "kl": 0.09203927963972092, "learning_rate": 4.790754897331668e-06, "loss": 0.0, "reward": 3.5, "reward_std": 2.629955530166626, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2455 }, { "completion_length": 434.0, "epoch": 0.6807095343680709, "grad_norm": 0.3699324131011963, "kl": 0.1039290800690651, "learning_rate": 4.790579531874164e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2456 }, { "completion_length": 451.75, "epoch": 0.6809866962305987, "grad_norm": 0.0, "kl": 0.10802675783634186, "learning_rate": 4.790404096174115e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2457 }, { "completion_length": 375.0, "epoch": 0.6812638580931264, "grad_norm": 0.0, "kl": 0.08161228150129318, "learning_rate": 4.790228590236899e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2458 }, { "completion_length": 422.25, "epoch": 0.6815410199556541, "grad_norm": 0.3987638056278229, "kl": 0.10637390613555908, "learning_rate": 4.790053014067902e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2459 }, { "completion_length": 433.0, "epoch": 0.6818181818181818, "grad_norm": 0.0, "kl": 0.09237166494131088, "learning_rate": 4.789877367672503e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2460 }, { "completion_length": 468.25, "epoch": 0.6820953436807096, "grad_norm": 0.0, "kl": 0.08563012629747391, "learning_rate": 4.789701651056092e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2461 }, { "completion_length": 527.5, "epoch": 0.6823725055432373, "grad_norm": 0.38907963037490845, "kl": 0.09930187463760376, "learning_rate": 4.789525864224055e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2462 }, { "completion_length": 475.25, "epoch": 0.6826496674057649, "grad_norm": 0.3867068290710449, "kl": 0.08120328187942505, "learning_rate": 4.789350007181784e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2463 }, { "completion_length": 442.75, "epoch": 0.6829268292682927, "grad_norm": 0.43075865507125854, "kl": 0.1058323010802269, "learning_rate": 4.789174079934673e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2464 }, { "completion_length": 425.0, "epoch": 0.6832039911308204, "grad_norm": 0.0, "kl": 0.10674567520618439, "learning_rate": 4.788998082488115e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2465 }, { "completion_length": 404.5, "epoch": 0.6834811529933481, "grad_norm": 0.0, "kl": 0.14683988690376282, "learning_rate": 4.7888220148475075e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2466 }, { "completion_length": 451.0, "epoch": 0.6837583148558758, "grad_norm": 0.35410529375076294, "kl": 0.09054439514875412, "learning_rate": 4.788645877018251e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2467 }, { "completion_length": 483.5, "epoch": 0.6840354767184036, "grad_norm": 0.0, "kl": 0.09183159470558167, "learning_rate": 4.788469669005745e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2468 }, { "completion_length": 486.25, "epoch": 0.6843126385809313, "grad_norm": 0.39659610390663147, "kl": 0.09584081918001175, "learning_rate": 4.788293390815395e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2469 }, { "completion_length": 497.0, "epoch": 0.684589800443459, "grad_norm": 0.0, "kl": 0.08577048778533936, "learning_rate": 4.788117042452605e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2470 }, { "completion_length": 408.25, "epoch": 0.6848669623059866, "grad_norm": 0.42376407980918884, "kl": 0.09569185227155685, "learning_rate": 4.787940623922784e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2471 }, { "completion_length": 475.0, "epoch": 0.6851441241685144, "grad_norm": 0.0, "kl": 0.09823109209537506, "learning_rate": 4.787764135231342e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2472 }, { "completion_length": 419.75, "epoch": 0.6854212860310421, "grad_norm": 0.4879254698753357, "kl": 0.12322553247213364, "learning_rate": 4.78758757638369e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2473 }, { "completion_length": 507.75, "epoch": 0.6856984478935698, "grad_norm": 0.3474425971508026, "kl": 0.09380710124969482, "learning_rate": 4.787410947385243e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2474 }, { "completion_length": 444.0, "epoch": 0.6859756097560976, "grad_norm": 0.44219711422920227, "kl": 0.16361555457115173, "learning_rate": 4.787234248241418e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2475 }, { "completion_length": 439.75, "epoch": 0.6862527716186253, "grad_norm": 0.0, "kl": 0.1284743994474411, "learning_rate": 4.787057478957634e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2476 }, { "completion_length": 419.75, "epoch": 0.686529933481153, "grad_norm": 0.47189220786094666, "kl": 0.1103905737400055, "learning_rate": 4.7868806395393106e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2477 }, { "completion_length": 480.5, "epoch": 0.6868070953436807, "grad_norm": 0.40102118253707886, "kl": 0.09288360178470612, "learning_rate": 4.78670372999187e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2478 }, { "completion_length": 442.75, "epoch": 0.6870842572062085, "grad_norm": 0.37132784724235535, "kl": 0.12980227172374725, "learning_rate": 4.7865267503207405e-06, "loss": -0.0, "reward": 5.59375, "reward_std": 0.3125, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 2479 }, { "completion_length": 483.5, "epoch": 0.6873614190687362, "grad_norm": 0.4007933437824249, "kl": 0.09888216108083725, "learning_rate": 4.786349700531346e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2480 }, { "completion_length": 440.0, "epoch": 0.6876385809312638, "grad_norm": 0.0, "kl": 0.10093807429075241, "learning_rate": 4.786172580629118e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2481 }, { "completion_length": 426.5, "epoch": 0.6879157427937915, "grad_norm": 0.4256715774536133, "kl": 0.09205938130617142, "learning_rate": 4.785995390619487e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2482 }, { "completion_length": 444.25, "epoch": 0.6881929046563193, "grad_norm": 0.4210346043109894, "kl": 0.0969885066151619, "learning_rate": 4.785818130507886e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2483 }, { "completion_length": 435.5, "epoch": 0.688470066518847, "grad_norm": 0.0, "kl": 80.91039276123047, "learning_rate": 4.785640800299752e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2484 }, { "completion_length": 447.75, "epoch": 0.6887472283813747, "grad_norm": 0.0, "kl": 0.14640869200229645, "learning_rate": 4.785463400000523e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2485 }, { "completion_length": 367.0, "epoch": 0.6890243902439024, "grad_norm": 0.0, "kl": 0.10175693780183792, "learning_rate": 4.785285929615639e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2486 }, { "completion_length": 394.25, "epoch": 0.6893015521064302, "grad_norm": 0.0, "kl": 0.11870445311069489, "learning_rate": 4.7851083891505414e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2487 }, { "completion_length": 473.75, "epoch": 0.6895787139689579, "grad_norm": 0.3782626688480377, "kl": 0.08862601220607758, "learning_rate": 4.784930778610676e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2488 }, { "completion_length": 455.0, "epoch": 0.6898558758314856, "grad_norm": 0.4162595272064209, "kl": 0.2935558259487152, "learning_rate": 4.7847530980014885e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2489 }, { "completion_length": 408.5, "epoch": 0.6901330376940134, "grad_norm": 0.5324833393096924, "kl": 0.10768117010593414, "learning_rate": 4.784575347328427e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2490 }, { "completion_length": 419.25, "epoch": 0.690410199556541, "grad_norm": 0.4172394275665283, "kl": 0.1257997751235962, "learning_rate": 4.784397526596945e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2491 }, { "completion_length": 375.0, "epoch": 0.6906873614190687, "grad_norm": 0.46389880776405334, "kl": 0.13891154527664185, "learning_rate": 4.784219635812493e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2492 }, { "completion_length": 390.25, "epoch": 0.6909645232815964, "grad_norm": 0.6500576138496399, "kl": 0.4608929753303528, "learning_rate": 4.784041674980526e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2493 }, { "completion_length": 401.25, "epoch": 0.6912416851441242, "grad_norm": 0.0, "kl": 0.1252494901418686, "learning_rate": 4.783863644106502e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2494 }, { "completion_length": 514.25, "epoch": 0.6915188470066519, "grad_norm": 0.4257895052433014, "kl": 0.09174524247646332, "learning_rate": 4.783685543195882e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2495 }, { "completion_length": 450.0, "epoch": 0.6917960088691796, "grad_norm": 0.0, "kl": 0.10486602038145065, "learning_rate": 4.783507372254126e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2496 }, { "completion_length": 492.0, "epoch": 0.6920731707317073, "grad_norm": 0.3975176513195038, "kl": 0.09485910832881927, "learning_rate": 4.783329131286697e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2497 }, { "completion_length": 515.5, "epoch": 0.6923503325942351, "grad_norm": 0.0, "kl": 0.11971861124038696, "learning_rate": 4.783150820299064e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2498 }, { "completion_length": 440.0, "epoch": 0.6926274944567627, "grad_norm": 0.0, "kl": 0.09789334982633591, "learning_rate": 4.782972439296691e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2499 }, { "completion_length": 403.75, "epoch": 0.6929046563192904, "grad_norm": 0.0, "kl": 0.11133746057748795, "learning_rate": 4.782793988285051e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2500 }, { "completion_length": 416.25, "epoch": 0.6931818181818182, "grad_norm": 0.5488296151161194, "kl": 0.22669051587581635, "learning_rate": 4.782615467269616e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2501 }, { "completion_length": 499.25, "epoch": 0.6934589800443459, "grad_norm": 0.5133971571922302, "kl": 0.08835648000240326, "learning_rate": 4.7824368762558595e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2502 }, { "completion_length": 465.75, "epoch": 0.6937361419068736, "grad_norm": 0.4077387750148773, "kl": 0.10563038289546967, "learning_rate": 4.782258215249259e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2503 }, { "completion_length": 424.75, "epoch": 0.6940133037694013, "grad_norm": 0.0, "kl": 0.1918417066335678, "learning_rate": 4.782079484255292e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2504 }, { "completion_length": 356.5, "epoch": 0.6942904656319291, "grad_norm": 0.0, "kl": 0.09530667215585709, "learning_rate": 4.781900683279441e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2505 }, { "completion_length": 446.75, "epoch": 0.6945676274944568, "grad_norm": 0.4208531975746155, "kl": 0.10962629318237305, "learning_rate": 4.781721812327189e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2506 }, { "completion_length": 424.0, "epoch": 0.6948447893569845, "grad_norm": 0.4641960561275482, "kl": 0.13754421472549438, "learning_rate": 4.78154287140402e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2507 }, { "completion_length": 419.5, "epoch": 0.6951219512195121, "grad_norm": 0.0, "kl": 0.09331724792718887, "learning_rate": 4.781363860515423e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2508 }, { "completion_length": 490.75, "epoch": 0.6953991130820399, "grad_norm": 0.3662467300891876, "kl": 0.12517212331295013, "learning_rate": 4.781184779666887e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2509 }, { "completion_length": 385.75, "epoch": 0.6956762749445676, "grad_norm": 0.0, "kl": 0.10862032324075699, "learning_rate": 4.781005628863903e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2510 }, { "completion_length": 442.0, "epoch": 0.6959534368070953, "grad_norm": 0.0, "kl": 0.08922704309225082, "learning_rate": 4.7808264081119635e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2511 }, { "completion_length": 384.25, "epoch": 0.6962305986696231, "grad_norm": 0.0, "kl": 0.09778928011655807, "learning_rate": 4.7806471174165676e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2512 }, { "completion_length": 398.25, "epoch": 0.6965077605321508, "grad_norm": 0.0, "kl": 0.1282838135957718, "learning_rate": 4.780467756783211e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2513 }, { "completion_length": 416.5, "epoch": 0.6967849223946785, "grad_norm": 0.46674636006355286, "kl": 0.11632272601127625, "learning_rate": 4.780288326217396e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2514 }, { "completion_length": 496.5, "epoch": 0.6970620842572062, "grad_norm": 0.38346603512763977, "kl": 0.10154680907726288, "learning_rate": 4.7801088257246235e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2515 }, { "completion_length": 415.5, "epoch": 0.697339246119734, "grad_norm": 0.4642426371574402, "kl": 0.24880369007587433, "learning_rate": 4.7799292553103974e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2516 }, { "completion_length": 465.0, "epoch": 0.6976164079822617, "grad_norm": 0.0, "kl": 0.15949714183807373, "learning_rate": 4.779749614980225e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2517 }, { "completion_length": 434.25, "epoch": 0.6978935698447893, "grad_norm": 0.0, "kl": 0.12823349237442017, "learning_rate": 4.779569904739617e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2518 }, { "completion_length": 444.0, "epoch": 0.698170731707317, "grad_norm": 0.47707855701446533, "kl": 0.10306177288293839, "learning_rate": 4.779390124594082e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2519 }, { "completion_length": 423.5, "epoch": 0.6984478935698448, "grad_norm": 0.43479815125465393, "kl": 0.1158643364906311, "learning_rate": 4.7792102745491345e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2520 }, { "completion_length": 399.5, "epoch": 0.6987250554323725, "grad_norm": 0.4597238302230835, "kl": 0.12596656382083893, "learning_rate": 4.779030354610288e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2521 }, { "completion_length": 484.25, "epoch": 0.6990022172949002, "grad_norm": 0.5209359526634216, "kl": 0.15120075643062592, "learning_rate": 4.778850364783062e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2522 }, { "completion_length": 397.5, "epoch": 0.6992793791574279, "grad_norm": 0.0, "kl": 0.1137278601527214, "learning_rate": 4.778670305072974e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2523 }, { "completion_length": 410.5, "epoch": 0.6995565410199557, "grad_norm": 0.0, "kl": 0.13022767007350922, "learning_rate": 4.778490175485549e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2524 }, { "completion_length": 442.5, "epoch": 0.6998337028824834, "grad_norm": 0.0, "kl": 0.10704226791858673, "learning_rate": 4.7783099760263065e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2525 }, { "completion_length": 453.75, "epoch": 0.700110864745011, "grad_norm": 0.41657882928848267, "kl": 0.19394052028656006, "learning_rate": 4.778129706700775e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2526 }, { "completion_length": 461.5, "epoch": 0.7003880266075388, "grad_norm": 0.3990677297115326, "kl": 0.13786780834197998, "learning_rate": 4.777949367514482e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2527 }, { "completion_length": 486.25, "epoch": 0.7006651884700665, "grad_norm": 0.3566300570964813, "kl": 0.21028269827365875, "learning_rate": 4.777768958472958e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2528 }, { "completion_length": 486.0, "epoch": 0.7009423503325942, "grad_norm": 0.0, "kl": 0.11602574586868286, "learning_rate": 4.777588479581736e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2529 }, { "completion_length": 462.0, "epoch": 0.7012195121951219, "grad_norm": 0.0, "kl": 0.10619036108255386, "learning_rate": 4.777407930846349e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2530 }, { "completion_length": 503.25, "epoch": 0.7014966740576497, "grad_norm": 0.40112677216529846, "kl": 0.24741125106811523, "learning_rate": 4.777227312272335e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2531 }, { "completion_length": 429.25, "epoch": 0.7017738359201774, "grad_norm": 0.0, "kl": 0.13453423976898193, "learning_rate": 4.7770466238652336e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2532 }, { "completion_length": 442.0, "epoch": 0.7020509977827051, "grad_norm": 0.0, "kl": 0.15055108070373535, "learning_rate": 4.776865865630583e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2533 }, { "completion_length": 386.5, "epoch": 0.7023281596452328, "grad_norm": 0.0, "kl": 0.1249297484755516, "learning_rate": 4.7766850375739285e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2534 }, { "completion_length": 415.25, "epoch": 0.7026053215077606, "grad_norm": 0.0, "kl": 0.11431457847356796, "learning_rate": 4.776504139700814e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2535 }, { "completion_length": 486.0, "epoch": 0.7028824833702882, "grad_norm": 0.0, "kl": 0.19967661798000336, "learning_rate": 4.776323172016788e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2536 }, { "completion_length": 410.75, "epoch": 0.7031596452328159, "grad_norm": 0.0, "kl": 0.16042429208755493, "learning_rate": 4.7761421345274e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2537 }, { "completion_length": 406.0, "epoch": 0.7034368070953437, "grad_norm": 0.4496612548828125, "kl": 0.14075972139835358, "learning_rate": 4.775961027238201e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2538 }, { "completion_length": 466.75, "epoch": 0.7037139689578714, "grad_norm": 0.0, "kl": 0.131073996424675, "learning_rate": 4.775779850154744e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2539 }, { "completion_length": 434.5, "epoch": 0.7039911308203991, "grad_norm": 0.5056547522544861, "kl": 0.13862639665603638, "learning_rate": 4.775598603282587e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2540 }, { "completion_length": 392.0, "epoch": 0.7042682926829268, "grad_norm": 0.0, "kl": 0.15579165518283844, "learning_rate": 4.775417286627287e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2541 }, { "completion_length": 374.75, "epoch": 0.7045454545454546, "grad_norm": 0.4298834204673767, "kl": 0.14595651626586914, "learning_rate": 4.775235900194404e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2542 }, { "completion_length": 462.0, "epoch": 0.7048226164079823, "grad_norm": 0.4301491677761078, "kl": 0.14863120019435883, "learning_rate": 4.775054443989501e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2543 }, { "completion_length": 437.5, "epoch": 0.70509977827051, "grad_norm": 0.0, "kl": 0.12518544495105743, "learning_rate": 4.774872918018142e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2544 }, { "completion_length": 452.0, "epoch": 0.7053769401330376, "grad_norm": 0.40393590927124023, "kl": 0.1017330139875412, "learning_rate": 4.774691322285894e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2545 }, { "completion_length": 435.5, "epoch": 0.7056541019955654, "grad_norm": 0.4776970148086548, "kl": 0.09889112412929535, "learning_rate": 4.774509656798326e-06, "loss": -0.0, "reward": 4.875, "reward_std": 1.75, "rewards/confident_score_func": 1.625, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2546 }, { "completion_length": 553.0, "epoch": 0.7059312638580931, "grad_norm": 0.3920235335826874, "kl": 0.10416718572378159, "learning_rate": 4.774327921561008e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2547 }, { "completion_length": 375.25, "epoch": 0.7062084257206208, "grad_norm": 0.4970637857913971, "kl": 0.13552123308181763, "learning_rate": 4.7741461165795134e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2548 }, { "completion_length": 499.25, "epoch": 0.7064855875831486, "grad_norm": 0.0, "kl": 0.13488949835300446, "learning_rate": 4.773964241859418e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2549 }, { "completion_length": 440.5, "epoch": 0.7067627494456763, "grad_norm": 0.44247904419898987, "kl": 0.17842237651348114, "learning_rate": 4.773782297406298e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2550 }, { "completion_length": 459.0, "epoch": 0.707039911308204, "grad_norm": 0.39238059520721436, "kl": 0.1321287751197815, "learning_rate": 4.773600283225735e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2551 }, { "completion_length": 428.75, "epoch": 0.7073170731707317, "grad_norm": 0.46249040961265564, "kl": 0.14795584976673126, "learning_rate": 4.773418199323309e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2552 }, { "completion_length": 512.25, "epoch": 0.7075942350332595, "grad_norm": 0.0, "kl": 0.11214373260736465, "learning_rate": 4.773236045704603e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2553 }, { "completion_length": 422.5, "epoch": 0.7078713968957872, "grad_norm": 0.0, "kl": 0.1860760748386383, "learning_rate": 4.773053822375205e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2554 }, { "completion_length": 480.75, "epoch": 0.7081485587583148, "grad_norm": 0.0, "kl": 0.10756747424602509, "learning_rate": 4.772871529340701e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2555 }, { "completion_length": 489.0, "epoch": 0.7084257206208425, "grad_norm": 0.44462504982948303, "kl": 0.14523939788341522, "learning_rate": 4.772689166606683e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2556 }, { "completion_length": 437.25, "epoch": 0.7087028824833703, "grad_norm": 0.0, "kl": 0.14325262606143951, "learning_rate": 4.7725067341787414e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2557 }, { "completion_length": 436.5, "epoch": 0.708980044345898, "grad_norm": 0.4087447226047516, "kl": 0.13489070534706116, "learning_rate": 4.772324232062473e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2558 }, { "completion_length": 427.25, "epoch": 0.7092572062084257, "grad_norm": 0.0, "kl": 0.17317424714565277, "learning_rate": 4.772141660263472e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2559 }, { "completion_length": 449.0, "epoch": 0.7095343680709535, "grad_norm": 0.0, "kl": 0.28067025542259216, "learning_rate": 4.771959018787338e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2560 }, { "completion_length": 420.0, "epoch": 0.7098115299334812, "grad_norm": 0.6485137939453125, "kl": 0.18574200570583344, "learning_rate": 4.771776307639673e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2561 }, { "completion_length": 469.0, "epoch": 0.7100886917960089, "grad_norm": 0.3993438184261322, "kl": 0.19251711666584015, "learning_rate": 4.771593526826078e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2562 }, { "completion_length": 492.5, "epoch": 0.7103658536585366, "grad_norm": 0.39043164253234863, "kl": 0.14858239889144897, "learning_rate": 4.77141067635216e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2563 }, { "completion_length": 426.5, "epoch": 0.7106430155210643, "grad_norm": 0.0, "kl": 0.3126022517681122, "learning_rate": 4.771227756223525e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2564 }, { "completion_length": 633.5, "epoch": 0.710920177383592, "grad_norm": 0.3862357437610626, "kl": 0.11021324247121811, "learning_rate": 4.771044766445783e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2565 }, { "completion_length": 431.75, "epoch": 0.7111973392461197, "grad_norm": 0.0, "kl": 0.12155352532863617, "learning_rate": 4.770861707024544e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2566 }, { "completion_length": 471.75, "epoch": 0.7114745011086474, "grad_norm": 0.37369871139526367, "kl": 0.12021665275096893, "learning_rate": 4.770678577965425e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2567 }, { "completion_length": 531.5, "epoch": 0.7117516629711752, "grad_norm": 0.0, "kl": 0.10523048043251038, "learning_rate": 4.770495379274039e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2568 }, { "completion_length": 449.5, "epoch": 0.7120288248337029, "grad_norm": 0.0, "kl": 0.14601249992847443, "learning_rate": 4.770312110956004e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2569 }, { "completion_length": 397.5, "epoch": 0.7123059866962306, "grad_norm": 0.0, "kl": 0.16705991327762604, "learning_rate": 4.770128773016942e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2570 }, { "completion_length": 437.5, "epoch": 0.7125831485587583, "grad_norm": 0.4361010789871216, "kl": 0.12414062768220901, "learning_rate": 4.769945365462473e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2571 }, { "completion_length": 465.75, "epoch": 0.7128603104212861, "grad_norm": 0.67332524061203, "kl": 0.10011205077171326, "learning_rate": 4.769761888298223e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2572 }, { "completion_length": 434.75, "epoch": 0.7131374722838137, "grad_norm": 0.46641820669174194, "kl": 0.19138018786907196, "learning_rate": 4.7695783415298176e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2573 }, { "completion_length": 407.0, "epoch": 0.7134146341463414, "grad_norm": 0.0, "kl": 0.1714315563440323, "learning_rate": 4.769394725162886e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2574 }, { "completion_length": 391.5, "epoch": 0.7136917960088692, "grad_norm": 0.0, "kl": 0.15789666771888733, "learning_rate": 4.7692110392030574e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2575 }, { "completion_length": 451.5, "epoch": 0.7139689578713969, "grad_norm": 0.0, "kl": 0.16959241032600403, "learning_rate": 4.769027283655967e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2576 }, { "completion_length": 542.75, "epoch": 0.7142461197339246, "grad_norm": 0.3541307747364044, "kl": 0.10981114953756332, "learning_rate": 4.768843458527249e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2577 }, { "completion_length": 440.0, "epoch": 0.7145232815964523, "grad_norm": 0.0, "kl": 0.15597139298915863, "learning_rate": 4.76865956382254e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2578 }, { "completion_length": 435.75, "epoch": 0.7148004434589801, "grad_norm": 0.44867125153541565, "kl": 0.1096101924777031, "learning_rate": 4.76847559954748e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2579 }, { "completion_length": 398.25, "epoch": 0.7150776053215078, "grad_norm": 0.0, "kl": 0.1321476399898529, "learning_rate": 4.768291565707709e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2580 }, { "completion_length": 423.75, "epoch": 0.7153547671840355, "grad_norm": 0.0, "kl": 0.11773857474327087, "learning_rate": 4.768107462308872e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2581 }, { "completion_length": 408.75, "epoch": 0.7156319290465631, "grad_norm": 0.5145248770713806, "kl": 0.12931695580482483, "learning_rate": 4.767923289356614e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2582 }, { "completion_length": 482.5, "epoch": 0.7159090909090909, "grad_norm": 0.4389020800590515, "kl": 0.11851652711629868, "learning_rate": 4.767739046856584e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2583 }, { "completion_length": 474.5, "epoch": 0.7161862527716186, "grad_norm": 0.0, "kl": 0.13561037182807922, "learning_rate": 4.76755473481443e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2584 }, { "completion_length": 484.0, "epoch": 0.7164634146341463, "grad_norm": 0.4691256582736969, "kl": 0.34676089882850647, "learning_rate": 4.767370353235805e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2585 }, { "completion_length": 438.0, "epoch": 0.7167405764966741, "grad_norm": 0.0, "kl": 0.12292744219303131, "learning_rate": 4.7671859021263635e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2586 }, { "completion_length": 540.0, "epoch": 0.7170177383592018, "grad_norm": 0.3804081976413727, "kl": 0.1155308410525322, "learning_rate": 4.767001381491763e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2587 }, { "completion_length": 461.5, "epoch": 0.7172949002217295, "grad_norm": 0.7523672580718994, "kl": 0.1231570914387703, "learning_rate": 4.766816791337658e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2588 }, { "completion_length": 474.0, "epoch": 0.7175720620842572, "grad_norm": 0.430393248796463, "kl": 0.1255391389131546, "learning_rate": 4.766632131669714e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2589 }, { "completion_length": 492.25, "epoch": 0.717849223946785, "grad_norm": 0.38417643308639526, "kl": 0.12715396285057068, "learning_rate": 4.76644740249359e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2590 }, { "completion_length": 497.75, "epoch": 0.7181263858093127, "grad_norm": 0.39832717180252075, "kl": 0.11255522817373276, "learning_rate": 4.766262603814954e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2591 }, { "completion_length": 651.75, "epoch": 0.7184035476718403, "grad_norm": 0.0, "kl": 0.1722763627767563, "learning_rate": 4.76607773563947e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2592 }, { "completion_length": 433.25, "epoch": 0.718680709534368, "grad_norm": 0.45066389441490173, "kl": 0.10701844096183777, "learning_rate": 4.7658927979728084e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2593 }, { "completion_length": 450.75, "epoch": 0.7189578713968958, "grad_norm": 0.0, "kl": 0.1295168250799179, "learning_rate": 4.7657077908206416e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2594 }, { "completion_length": 483.0, "epoch": 0.7192350332594235, "grad_norm": 0.0, "kl": 0.1755368411540985, "learning_rate": 4.7655227141886406e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2595 }, { "completion_length": 457.5, "epoch": 0.7195121951219512, "grad_norm": 0.4171423316001892, "kl": 0.10426945984363556, "learning_rate": 4.765337568082483e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2596 }, { "completion_length": 446.0, "epoch": 0.719789356984479, "grad_norm": 0.4476141035556793, "kl": 0.17364723980426788, "learning_rate": 4.765152352507846e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2597 }, { "completion_length": 482.0, "epoch": 0.7200665188470067, "grad_norm": 0.47274845838546753, "kl": 0.10454470664262772, "learning_rate": 4.764967067470409e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2598 }, { "completion_length": 433.25, "epoch": 0.7203436807095344, "grad_norm": 0.4827755093574524, "kl": 0.12862278521060944, "learning_rate": 4.764781712975854e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2599 }, { "completion_length": 517.5, "epoch": 0.720620842572062, "grad_norm": 0.39210256934165955, "kl": 0.16288040578365326, "learning_rate": 4.764596289029866e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2600 }, { "completion_length": 444.25, "epoch": 0.7208980044345898, "grad_norm": 0.4551485776901245, "kl": 0.15010227262973785, "learning_rate": 4.764410795638129e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2601 }, { "completion_length": 458.75, "epoch": 0.7211751662971175, "grad_norm": 0.0, "kl": 0.32462450861930847, "learning_rate": 4.764225232806333e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2602 }, { "completion_length": 423.75, "epoch": 0.7214523281596452, "grad_norm": 0.0, "kl": 0.15079984068870544, "learning_rate": 4.764039600540168e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2603 }, { "completion_length": 435.5, "epoch": 0.7217294900221729, "grad_norm": 0.44817402958869934, "kl": 0.14187569916248322, "learning_rate": 4.763853898845327e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2604 }, { "completion_length": 475.5, "epoch": 0.7220066518847007, "grad_norm": 0.0, "kl": 0.13072469830513, "learning_rate": 4.763668127727504e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2605 }, { "completion_length": 526.25, "epoch": 0.7222838137472284, "grad_norm": 0.0, "kl": 0.17007602751255035, "learning_rate": 4.763482287192396e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2606 }, { "completion_length": 574.75, "epoch": 0.7225609756097561, "grad_norm": 0.32204771041870117, "kl": 0.112642802298069, "learning_rate": 4.763296377245703e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2607 }, { "completion_length": 493.5, "epoch": 0.7228381374722838, "grad_norm": 0.3938816487789154, "kl": 0.11338631063699722, "learning_rate": 4.763110397893124e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2608 }, { "completion_length": 404.0, "epoch": 0.7231152993348116, "grad_norm": 0.0, "kl": 0.13606075942516327, "learning_rate": 4.762924349140363e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2609 }, { "completion_length": 529.75, "epoch": 0.7233924611973392, "grad_norm": 0.39182260632514954, "kl": 0.1326897144317627, "learning_rate": 4.762738230993128e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2610 }, { "completion_length": 409.75, "epoch": 0.7236696230598669, "grad_norm": 0.0, "kl": 1.2499638795852661, "learning_rate": 4.762552043457122e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2611 }, { "completion_length": 516.25, "epoch": 0.7239467849223947, "grad_norm": 0.4080416262149811, "kl": 0.10939490795135498, "learning_rate": 4.762365786538058e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2612 }, { "completion_length": 551.0, "epoch": 0.7242239467849224, "grad_norm": 0.0, "kl": 0.11862620711326599, "learning_rate": 4.762179460241646e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2613 }, { "completion_length": 563.5, "epoch": 0.7245011086474501, "grad_norm": 0.0, "kl": 0.11465094238519669, "learning_rate": 4.761993064573601e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2614 }, { "completion_length": 461.75, "epoch": 0.7247782705099778, "grad_norm": 0.42761290073394775, "kl": 0.12484212964773178, "learning_rate": 4.761806599539638e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2615 }, { "completion_length": 559.0, "epoch": 0.7250554323725056, "grad_norm": 0.33716946840286255, "kl": 0.11387840658426285, "learning_rate": 4.761620065145475e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2616 }, { "completion_length": 493.0, "epoch": 0.7253325942350333, "grad_norm": 0.0, "kl": 0.12548302114009857, "learning_rate": 4.761433461396833e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2617 }, { "completion_length": 552.5, "epoch": 0.725609756097561, "grad_norm": 0.0, "kl": 0.10993490368127823, "learning_rate": 4.761246788299435e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2618 }, { "completion_length": 543.75, "epoch": 0.7258869179600886, "grad_norm": 0.0, "kl": 0.117269366979599, "learning_rate": 4.761060045859003e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2619 }, { "completion_length": 495.25, "epoch": 0.7261640798226164, "grad_norm": 0.41023895144462585, "kl": 0.12752987444400787, "learning_rate": 4.760873234081267e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2620 }, { "completion_length": 465.25, "epoch": 0.7264412416851441, "grad_norm": 0.0, "kl": 0.3315819501876831, "learning_rate": 4.760686352971952e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2621 }, { "completion_length": 417.75, "epoch": 0.7267184035476718, "grad_norm": 0.0, "kl": 0.14057327806949615, "learning_rate": 4.760499402536792e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2622 }, { "completion_length": 503.25, "epoch": 0.7269955654101996, "grad_norm": 0.0, "kl": 0.18548637628555298, "learning_rate": 4.760312382781518e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2623 }, { "completion_length": 505.25, "epoch": 0.7272727272727273, "grad_norm": 0.3637387156486511, "kl": 0.1565323919057846, "learning_rate": 4.7601252937118665e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2624 }, { "completion_length": 455.0, "epoch": 0.727549889135255, "grad_norm": 0.40712466835975647, "kl": 0.1485731154680252, "learning_rate": 4.759938135333574e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2625 }, { "completion_length": 529.75, "epoch": 0.7278270509977827, "grad_norm": 0.37933167815208435, "kl": 0.14041480422019958, "learning_rate": 4.759750907652379e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2626 }, { "completion_length": 463.5, "epoch": 0.7281042128603105, "grad_norm": 0.5236484408378601, "kl": 0.4268907308578491, "learning_rate": 4.759563610674026e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2627 }, { "completion_length": 470.0, "epoch": 0.7283813747228381, "grad_norm": 0.43942078948020935, "kl": 0.1205759346485138, "learning_rate": 4.759376244404255e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2628 }, { "completion_length": 532.0, "epoch": 0.7286585365853658, "grad_norm": 0.0, "kl": 0.11496824026107788, "learning_rate": 4.759188808848814e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2629 }, { "completion_length": 487.0, "epoch": 0.7289356984478935, "grad_norm": 0.4222484529018402, "kl": 0.3204265832901001, "learning_rate": 4.759001304013449e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2630 }, { "completion_length": 419.0, "epoch": 0.7292128603104213, "grad_norm": 0.5757212042808533, "kl": 0.14565108716487885, "learning_rate": 4.758813729903913e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2631 }, { "completion_length": 484.5, "epoch": 0.729490022172949, "grad_norm": 0.0, "kl": 0.2052452713251114, "learning_rate": 4.758626086525956e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2632 }, { "completion_length": 443.0, "epoch": 0.7297671840354767, "grad_norm": 0.48949551582336426, "kl": 0.12220417708158493, "learning_rate": 4.758438373885333e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2633 }, { "completion_length": 520.0, "epoch": 0.7300443458980045, "grad_norm": 0.4258374571800232, "kl": 0.16262102127075195, "learning_rate": 4.758250591987799e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2634 }, { "completion_length": 521.25, "epoch": 0.7303215077605322, "grad_norm": 0.35432901978492737, "kl": 0.11777713894844055, "learning_rate": 4.758062740839113e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2635 }, { "completion_length": 568.0, "epoch": 0.7305986696230599, "grad_norm": 0.5209287405014038, "kl": 0.3918100893497467, "learning_rate": 4.7578748204450366e-06, "loss": 0.0, "reward": 5.5, "reward_std": 0.5, "rewards/confident_score_func": 1.75, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2636 }, { "completion_length": 562.25, "epoch": 0.7308758314855875, "grad_norm": 0.4395611882209778, "kl": 0.1242109090089798, "learning_rate": 4.757686830811332e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2637 }, { "completion_length": 493.0, "epoch": 0.7311529933481153, "grad_norm": 0.0, "kl": 0.11573011428117752, "learning_rate": 4.757498771943765e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2638 }, { "completion_length": 454.0, "epoch": 0.731430155210643, "grad_norm": 0.0, "kl": 0.10879828035831451, "learning_rate": 4.7573106438481e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2639 }, { "completion_length": 477.25, "epoch": 0.7317073170731707, "grad_norm": 0.4890296459197998, "kl": 0.18933835625648499, "learning_rate": 4.757122446530108e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2640 }, { "completion_length": 817.25, "epoch": 0.7319844789356984, "grad_norm": 0.0, "kl": 0.08263709396123886, "learning_rate": 4.756934179995561e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2641 }, { "completion_length": 614.75, "epoch": 0.7322616407982262, "grad_norm": 0.2701183557510376, "kl": 0.08947141468524933, "learning_rate": 4.75674584425023e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2642 }, { "completion_length": 481.25, "epoch": 0.7325388026607539, "grad_norm": 0.4244672954082489, "kl": 0.1037135049700737, "learning_rate": 4.7565574392998935e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2643 }, { "completion_length": 452.25, "epoch": 0.7328159645232816, "grad_norm": 0.5376319289207458, "kl": 0.10399101674556732, "learning_rate": 4.756368965150326e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2644 }, { "completion_length": 484.5, "epoch": 0.7330931263858093, "grad_norm": 0.0, "kl": 0.1519468128681183, "learning_rate": 4.756180421807309e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2645 }, { "completion_length": 462.75, "epoch": 0.733370288248337, "grad_norm": 0.39180442690849304, "kl": 0.11748431622982025, "learning_rate": 4.755991809276623e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2646 }, { "completion_length": 484.25, "epoch": 0.7336474501108647, "grad_norm": 0.4808994233608246, "kl": 0.15388011932373047, "learning_rate": 4.755803127564055e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2647 }, { "completion_length": 393.75, "epoch": 0.7339246119733924, "grad_norm": 0.0, "kl": 0.11530598998069763, "learning_rate": 4.755614376675387e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2648 }, { "completion_length": 440.5, "epoch": 0.7342017738359202, "grad_norm": 0.0, "kl": 0.11325935274362564, "learning_rate": 4.755425556616411e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2649 }, { "completion_length": 438.25, "epoch": 0.7344789356984479, "grad_norm": 0.0, "kl": 0.12824209034442902, "learning_rate": 4.755236667392914e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2650 }, { "completion_length": 398.25, "epoch": 0.7347560975609756, "grad_norm": 0.0, "kl": 0.17472267150878906, "learning_rate": 4.755047709010691e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2651 }, { "completion_length": 466.5, "epoch": 0.7350332594235033, "grad_norm": 0.0, "kl": 0.13525405526161194, "learning_rate": 4.754858681475534e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2652 }, { "completion_length": 376.25, "epoch": 0.7353104212860311, "grad_norm": 0.0, "kl": 0.1625170111656189, "learning_rate": 4.754669584793242e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2653 }, { "completion_length": 449.75, "epoch": 0.7355875831485588, "grad_norm": 0.0, "kl": 0.14406250417232513, "learning_rate": 4.754480418969614e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2654 }, { "completion_length": 445.0, "epoch": 0.7358647450110865, "grad_norm": 0.0, "kl": 0.1431419402360916, "learning_rate": 4.754291184010449e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2655 }, { "completion_length": 441.25, "epoch": 0.7361419068736141, "grad_norm": 0.0, "kl": 0.14548397064208984, "learning_rate": 4.754101879921551e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2656 }, { "completion_length": 446.25, "epoch": 0.7364190687361419, "grad_norm": 0.4340497851371765, "kl": 0.10789433121681213, "learning_rate": 4.753912506708726e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2657 }, { "completion_length": 558.75, "epoch": 0.7366962305986696, "grad_norm": 0.4712428152561188, "kl": 0.1047641858458519, "learning_rate": 4.75372306437778e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2658 }, { "completion_length": 460.5, "epoch": 0.7369733924611973, "grad_norm": 0.4387286603450775, "kl": 0.41428858041763306, "learning_rate": 4.753533552934522e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2659 }, { "completion_length": 485.5, "epoch": 0.7372505543237251, "grad_norm": 0.0, "kl": 0.11455868929624557, "learning_rate": 4.753343972384765e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2660 }, { "completion_length": 441.75, "epoch": 0.7375277161862528, "grad_norm": 0.47126391530036926, "kl": 0.11341482400894165, "learning_rate": 4.753154322734322e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2661 }, { "completion_length": 415.75, "epoch": 0.7378048780487805, "grad_norm": 0.0, "kl": 0.13225583732128143, "learning_rate": 4.752964603989009e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2662 }, { "completion_length": 431.0, "epoch": 0.7380820399113082, "grad_norm": 0.0, "kl": 0.14290772378444672, "learning_rate": 4.752774816154644e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2663 }, { "completion_length": 498.0, "epoch": 0.738359201773836, "grad_norm": 0.42104727029800415, "kl": 0.13157376646995544, "learning_rate": 4.752584959237047e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2664 }, { "completion_length": 509.25, "epoch": 0.7386363636363636, "grad_norm": 0.0, "kl": 0.1724664270877838, "learning_rate": 4.7523950332420385e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2665 }, { "completion_length": 516.5, "epoch": 0.7389135254988913, "grad_norm": 0.39125221967697144, "kl": 0.10790521651506424, "learning_rate": 4.752205038175445e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2666 }, { "completion_length": 445.75, "epoch": 0.739190687361419, "grad_norm": 0.4259146451950073, "kl": 0.20252244174480438, "learning_rate": 4.752014974043092e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2667 }, { "completion_length": 502.0, "epoch": 0.7394678492239468, "grad_norm": 0.0, "kl": 0.11150974780321121, "learning_rate": 4.751824840850807e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2668 }, { "completion_length": 444.25, "epoch": 0.7397450110864745, "grad_norm": 0.42699289321899414, "kl": 0.13915392756462097, "learning_rate": 4.751634638604422e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2669 }, { "completion_length": 433.25, "epoch": 0.7400221729490022, "grad_norm": 0.0, "kl": 0.13930220901966095, "learning_rate": 4.75144436730977e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2670 }, { "completion_length": 537.25, "epoch": 0.74029933481153, "grad_norm": 0.3739576041698456, "kl": 0.11212605237960815, "learning_rate": 4.751254026972684e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2671 }, { "completion_length": 481.0, "epoch": 0.7405764966740577, "grad_norm": 0.0, "kl": 0.11548470705747604, "learning_rate": 4.751063617599002e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2672 }, { "completion_length": 458.25, "epoch": 0.7408536585365854, "grad_norm": 0.3835342228412628, "kl": 0.13952817022800446, "learning_rate": 4.750873139194563e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2673 }, { "completion_length": 445.5, "epoch": 0.741130820399113, "grad_norm": 0.43924781680107117, "kl": 0.13728734850883484, "learning_rate": 4.750682591765209e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2674 }, { "completion_length": 510.5, "epoch": 0.7414079822616408, "grad_norm": 0.0, "kl": 0.11686185747385025, "learning_rate": 4.750491975316783e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2675 }, { "completion_length": 437.5, "epoch": 0.7416851441241685, "grad_norm": 0.0, "kl": 0.141534224152565, "learning_rate": 4.750301289855128e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2676 }, { "completion_length": 484.5, "epoch": 0.7419623059866962, "grad_norm": 0.31425637006759644, "kl": 0.1308365762233734, "learning_rate": 4.750110535386094e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2677 }, { "completion_length": 434.25, "epoch": 0.7422394678492239, "grad_norm": 0.4155649244785309, "kl": 0.12683477997779846, "learning_rate": 4.749919711915531e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2678 }, { "completion_length": 428.0, "epoch": 0.7425166297117517, "grad_norm": 0.4168071448802948, "kl": 0.14565134048461914, "learning_rate": 4.749728819449289e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2679 }, { "completion_length": 497.25, "epoch": 0.7427937915742794, "grad_norm": 0.38339245319366455, "kl": 0.1532461941242218, "learning_rate": 4.749537857993224e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2680 }, { "completion_length": 486.5, "epoch": 0.7430709534368071, "grad_norm": 0.0, "kl": 0.1890948861837387, "learning_rate": 4.74934682755319e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2681 }, { "completion_length": 484.25, "epoch": 0.7433481152993349, "grad_norm": 0.3825908601284027, "kl": 0.13011963665485382, "learning_rate": 4.7491557281350455e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2682 }, { "completion_length": 368.75, "epoch": 0.7436252771618626, "grad_norm": 0.0, "kl": 0.11127598583698273, "learning_rate": 4.748964559744651e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2683 }, { "completion_length": 500.0, "epoch": 0.7439024390243902, "grad_norm": 0.4742351770401001, "kl": 0.328716903924942, "learning_rate": 4.748773322387869e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2684 }, { "completion_length": 451.5, "epoch": 0.7441796008869179, "grad_norm": 0.0, "kl": 0.1370600163936615, "learning_rate": 4.748582016070564e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2685 }, { "completion_length": 548.5, "epoch": 0.7444567627494457, "grad_norm": 0.425271213054657, "kl": 0.11424241214990616, "learning_rate": 4.748390640798602e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2686 }, { "completion_length": 517.5, "epoch": 0.7447339246119734, "grad_norm": 0.36824488639831543, "kl": 0.13088613748550415, "learning_rate": 4.748199196577853e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2687 }, { "completion_length": 479.75, "epoch": 0.7450110864745011, "grad_norm": 0.0, "kl": 0.18377619981765747, "learning_rate": 4.7480076834141865e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2688 }, { "completion_length": 475.75, "epoch": 0.7452882483370288, "grad_norm": 0.3635919690132141, "kl": 0.13013678789138794, "learning_rate": 4.747816101313476e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2689 }, { "completion_length": 489.5, "epoch": 0.7455654101995566, "grad_norm": 0.38327261805534363, "kl": 0.16692772507667542, "learning_rate": 4.7476244502815966e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2690 }, { "completion_length": 419.0, "epoch": 0.7458425720620843, "grad_norm": 0.37714871764183044, "kl": 0.14568589627742767, "learning_rate": 4.747432730324425e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2691 }, { "completion_length": 520.75, "epoch": 0.746119733924612, "grad_norm": 0.3466518521308899, "kl": 0.1045876145362854, "learning_rate": 4.74724094144784e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2692 }, { "completion_length": 465.5, "epoch": 0.7463968957871396, "grad_norm": 0.4989699125289917, "kl": 0.21574901044368744, "learning_rate": 4.747049083657724e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2693 }, { "completion_length": 412.0, "epoch": 0.7466740576496674, "grad_norm": 0.5331481099128723, "kl": 0.1256663203239441, "learning_rate": 4.7468571569599605e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2694 }, { "completion_length": 488.5, "epoch": 0.7469512195121951, "grad_norm": 0.367993026971817, "kl": 0.1440844088792801, "learning_rate": 4.7466651613604345e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2695 }, { "completion_length": 464.0, "epoch": 0.7472283813747228, "grad_norm": 0.4457774758338928, "kl": 0.14523518085479736, "learning_rate": 4.746473096865034e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2696 }, { "completion_length": 437.25, "epoch": 0.7475055432372506, "grad_norm": 0.4423355460166931, "kl": 0.14285816252231598, "learning_rate": 4.746280963479648e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2697 }, { "completion_length": 513.75, "epoch": 0.7477827050997783, "grad_norm": 0.0, "kl": 0.1214366927742958, "learning_rate": 4.7460887612101704e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2698 }, { "completion_length": 461.75, "epoch": 0.748059866962306, "grad_norm": 0.0, "kl": 0.1593211442232132, "learning_rate": 4.745896490062493e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2699 }, { "completion_length": 476.0, "epoch": 0.7483370288248337, "grad_norm": 0.0, "kl": 0.12763622403144836, "learning_rate": 4.745704150042513e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2700 }, { "completion_length": 503.25, "epoch": 0.7486141906873615, "grad_norm": 0.393824964761734, "kl": 0.1108083426952362, "learning_rate": 4.745511741156129e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2701 }, { "completion_length": 471.5, "epoch": 0.7488913525498891, "grad_norm": 0.41478466987609863, "kl": 0.15295828878879547, "learning_rate": 4.745319263409241e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2702 }, { "completion_length": 524.25, "epoch": 0.7491685144124168, "grad_norm": 0.0, "kl": 0.2582961320877075, "learning_rate": 4.7451267168077506e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2703 }, { "completion_length": 453.25, "epoch": 0.7494456762749445, "grad_norm": 0.36669930815696716, "kl": 0.13299721479415894, "learning_rate": 4.744934101357565e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2704 }, { "completion_length": 394.25, "epoch": 0.7497228381374723, "grad_norm": 0.0, "kl": 0.13813528418540955, "learning_rate": 4.744741417064587e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2705 }, { "completion_length": 369.75, "epoch": 0.75, "grad_norm": 0.47428786754608154, "kl": 0.16299650073051453, "learning_rate": 4.74454866393473e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2706 }, { "completion_length": 404.5, "epoch": 0.7502771618625277, "grad_norm": 0.0, "kl": 0.13488705456256866, "learning_rate": 4.744355841973901e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2707 }, { "completion_length": 543.75, "epoch": 0.7505543237250555, "grad_norm": 0.368033230304718, "kl": 0.11394274979829788, "learning_rate": 4.744162951188015e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2708 }, { "completion_length": 496.5, "epoch": 0.7508314855875832, "grad_norm": 0.37519562244415283, "kl": 0.11291852593421936, "learning_rate": 4.743969991582986e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2709 }, { "completion_length": 543.75, "epoch": 0.7511086474501109, "grad_norm": 0.3472137153148651, "kl": 0.14750082790851593, "learning_rate": 4.743776963164733e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2710 }, { "completion_length": 429.75, "epoch": 0.7513858093126385, "grad_norm": 0.39675453305244446, "kl": 0.12448649108409882, "learning_rate": 4.743583865939175e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2711 }, { "completion_length": 417.75, "epoch": 0.7516629711751663, "grad_norm": 0.0, "kl": 0.1408003270626068, "learning_rate": 4.743390699912232e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2712 }, { "completion_length": 475.75, "epoch": 0.751940133037694, "grad_norm": 0.41811221837997437, "kl": 0.11439573764801025, "learning_rate": 4.743197465089828e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2713 }, { "completion_length": 809.75, "epoch": 0.7522172949002217, "grad_norm": 0.4306460916996002, "kl": 0.09521330147981644, "learning_rate": 4.74300416147789e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2714 }, { "completion_length": 459.25, "epoch": 0.7524944567627494, "grad_norm": 0.0, "kl": 0.11935567855834961, "learning_rate": 4.742810789082345e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2715 }, { "completion_length": 406.25, "epoch": 0.7527716186252772, "grad_norm": 0.0, "kl": 0.1395460069179535, "learning_rate": 4.7426173479091235e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2716 }, { "completion_length": 455.75, "epoch": 0.7530487804878049, "grad_norm": 0.38981908559799194, "kl": 0.11247662454843521, "learning_rate": 4.742423837964156e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2717 }, { "completion_length": 441.5, "epoch": 0.7533259423503326, "grad_norm": 0.43648630380630493, "kl": 0.545258104801178, "learning_rate": 4.742230259253378e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2718 }, { "completion_length": 467.25, "epoch": 0.7536031042128604, "grad_norm": 0.5175750851631165, "kl": 0.18290014564990997, "learning_rate": 4.742036611782726e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2719 }, { "completion_length": 450.75, "epoch": 0.753880266075388, "grad_norm": 0.46664005517959595, "kl": 0.42275500297546387, "learning_rate": 4.741842895558137e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2720 }, { "completion_length": 415.0, "epoch": 0.7541574279379157, "grad_norm": 0.0, "kl": 0.14381884038448334, "learning_rate": 4.741649110585552e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2721 }, { "completion_length": 531.0, "epoch": 0.7544345898004434, "grad_norm": 0.0, "kl": 0.15964575111865997, "learning_rate": 4.741455256870914e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2722 }, { "completion_length": 425.75, "epoch": 0.7547117516629712, "grad_norm": 0.0, "kl": 0.12904517352581024, "learning_rate": 4.741261334420168e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2723 }, { "completion_length": 530.0, "epoch": 0.7549889135254989, "grad_norm": 0.35885223746299744, "kl": 0.1321117877960205, "learning_rate": 4.74106734323926e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2724 }, { "completion_length": 500.25, "epoch": 0.7552660753880266, "grad_norm": 0.4559873342514038, "kl": 0.28646501898765564, "learning_rate": 4.740873283334139e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2725 }, { "completion_length": 448.25, "epoch": 0.7555432372505543, "grad_norm": 0.0, "kl": 0.13123241066932678, "learning_rate": 4.740679154710757e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2726 }, { "completion_length": 486.75, "epoch": 0.7558203991130821, "grad_norm": 0.37792250514030457, "kl": 0.17713014781475067, "learning_rate": 4.740484957375065e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2727 }, { "completion_length": 456.75, "epoch": 0.7560975609756098, "grad_norm": 0.4761711061000824, "kl": 0.12683430314064026, "learning_rate": 4.740290691333021e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2728 }, { "completion_length": 496.75, "epoch": 0.7563747228381374, "grad_norm": 0.0, "kl": 0.14713342487812042, "learning_rate": 4.74009635659058e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2729 }, { "completion_length": 401.75, "epoch": 0.7566518847006651, "grad_norm": 0.4410831332206726, "kl": 0.13598093390464783, "learning_rate": 4.739901953153702e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2730 }, { "completion_length": 484.25, "epoch": 0.7569290465631929, "grad_norm": 0.3868559002876282, "kl": 0.12973995506763458, "learning_rate": 4.739707481028349e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2731 }, { "completion_length": 413.5, "epoch": 0.7572062084257206, "grad_norm": 0.4679473340511322, "kl": 0.27499884366989136, "learning_rate": 4.739512940220484e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2732 }, { "completion_length": 414.25, "epoch": 0.7574833702882483, "grad_norm": 0.4443478286266327, "kl": 0.14039358496665955, "learning_rate": 4.739318330736075e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2733 }, { "completion_length": 435.25, "epoch": 0.7577605321507761, "grad_norm": 0.0, "kl": 0.13759902119636536, "learning_rate": 4.7391236525810865e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2734 }, { "completion_length": 486.25, "epoch": 0.7580376940133038, "grad_norm": 0.0, "kl": 0.10782123357057571, "learning_rate": 4.738928905761491e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2735 }, { "completion_length": 436.5, "epoch": 0.7583148558758315, "grad_norm": 0.42395666241645813, "kl": 0.1141713410615921, "learning_rate": 4.73873409028326e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2736 }, { "completion_length": 426.5, "epoch": 0.7585920177383592, "grad_norm": 0.0, "kl": 0.15990538895130157, "learning_rate": 4.7385392061523666e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2737 }, { "completion_length": 599.25, "epoch": 0.758869179600887, "grad_norm": 0.44514229893684387, "kl": 0.09807313233613968, "learning_rate": 4.738344253374788e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2738 }, { "completion_length": 396.0, "epoch": 0.7591463414634146, "grad_norm": 0.4717368483543396, "kl": 0.18029940128326416, "learning_rate": 4.738149231956503e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2739 }, { "completion_length": 452.25, "epoch": 0.7594235033259423, "grad_norm": 0.43387851119041443, "kl": 0.36267852783203125, "learning_rate": 4.73795414190349e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2740 }, { "completion_length": 444.75, "epoch": 0.75970066518847, "grad_norm": 0.0, "kl": 0.14544107019901276, "learning_rate": 4.737758983221734e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2741 }, { "completion_length": 409.25, "epoch": 0.7599778270509978, "grad_norm": 0.41713181138038635, "kl": 0.12500053644180298, "learning_rate": 4.737563755917219e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2742 }, { "completion_length": 455.0, "epoch": 0.7602549889135255, "grad_norm": 0.3756922781467438, "kl": 0.14143961668014526, "learning_rate": 4.737368459995933e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2743 }, { "completion_length": 530.0, "epoch": 0.7605321507760532, "grad_norm": 0.0, "kl": 0.12417665123939514, "learning_rate": 4.737173095463862e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2744 }, { "completion_length": 426.5, "epoch": 0.760809312638581, "grad_norm": 0.500941812992096, "kl": 0.14287890493869781, "learning_rate": 4.736977662326998e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2745 }, { "completion_length": 544.25, "epoch": 0.7610864745011087, "grad_norm": 0.0, "kl": 0.11830296367406845, "learning_rate": 4.7367821605913355e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2746 }, { "completion_length": 472.25, "epoch": 0.7613636363636364, "grad_norm": 0.46466290950775146, "kl": 0.14854049682617188, "learning_rate": 4.736586590262869e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2747 }, { "completion_length": 423.5, "epoch": 0.761640798226164, "grad_norm": 0.4133607745170593, "kl": 0.1347285956144333, "learning_rate": 4.736390951347596e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2748 }, { "completion_length": 406.25, "epoch": 0.7619179600886918, "grad_norm": 0.0, "kl": 0.11858431994915009, "learning_rate": 4.736195243851515e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2749 }, { "completion_length": 497.0, "epoch": 0.7621951219512195, "grad_norm": 0.0, "kl": 0.12458670884370804, "learning_rate": 4.735999467780628e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2750 }, { "completion_length": 416.25, "epoch": 0.7624722838137472, "grad_norm": 0.0, "kl": 0.15798018872737885, "learning_rate": 4.73580362314094e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2751 }, { "completion_length": 519.75, "epoch": 0.7627494456762749, "grad_norm": 0.42239344120025635, "kl": 0.12093277275562286, "learning_rate": 4.735607709938455e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2752 }, { "completion_length": 412.5, "epoch": 0.7630266075388027, "grad_norm": 0.4825350046157837, "kl": 0.17275376617908478, "learning_rate": 4.735411728179181e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2753 }, { "completion_length": 515.0, "epoch": 0.7633037694013304, "grad_norm": 0.0, "kl": 0.14357970654964447, "learning_rate": 4.735215677869129e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2754 }, { "completion_length": 397.75, "epoch": 0.7635809312638581, "grad_norm": 0.0, "kl": 0.12474759668111801, "learning_rate": 4.73501955901431e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2755 }, { "completion_length": 520.5, "epoch": 0.7638580931263859, "grad_norm": 0.3606237769126892, "kl": 0.11723088473081589, "learning_rate": 4.734823371620738e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2756 }, { "completion_length": 466.25, "epoch": 0.7641352549889135, "grad_norm": 0.4275756776332855, "kl": 0.1437297761440277, "learning_rate": 4.73462711569443e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2757 }, { "completion_length": 588.5, "epoch": 0.7644124168514412, "grad_norm": 0.4305463433265686, "kl": 0.1444823145866394, "learning_rate": 4.734430791241406e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2758 }, { "completion_length": 556.0, "epoch": 0.7646895787139689, "grad_norm": 0.35965195298194885, "kl": 0.13030539453029633, "learning_rate": 4.734234398267683e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2759 }, { "completion_length": 538.75, "epoch": 0.7649667405764967, "grad_norm": 0.32442253828048706, "kl": 0.13493552803993225, "learning_rate": 4.734037936779286e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2760 }, { "completion_length": 613.5, "epoch": 0.7652439024390244, "grad_norm": 0.0, "kl": 0.10496439784765244, "learning_rate": 4.733841406782238e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2761 }, { "completion_length": 456.75, "epoch": 0.7655210643015521, "grad_norm": 0.4472352862358093, "kl": 0.1293545514345169, "learning_rate": 4.733644808282567e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2762 }, { "completion_length": 483.75, "epoch": 0.7657982261640798, "grad_norm": 0.4425487816333771, "kl": 0.1593986600637436, "learning_rate": 4.733448141286302e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2763 }, { "completion_length": 429.25, "epoch": 0.7660753880266076, "grad_norm": 0.40334978699684143, "kl": 0.19792333245277405, "learning_rate": 4.733251405799472e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2764 }, { "completion_length": 490.25, "epoch": 0.7663525498891353, "grad_norm": 0.0, "kl": 0.13641412556171417, "learning_rate": 4.733054601828112e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2765 }, { "completion_length": 445.0, "epoch": 0.766629711751663, "grad_norm": 0.0, "kl": 0.2247895896434784, "learning_rate": 4.732857729378257e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2766 }, { "completion_length": 474.5, "epoch": 0.7669068736141907, "grad_norm": 0.6473143696784973, "kl": 0.1678345650434494, "learning_rate": 4.732660788455944e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2767 }, { "completion_length": 461.75, "epoch": 0.7671840354767184, "grad_norm": 0.0, "kl": 0.1573033481836319, "learning_rate": 4.732463779067212e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2768 }, { "completion_length": 440.5, "epoch": 0.7674611973392461, "grad_norm": 0.0, "kl": 0.15006023645401, "learning_rate": 4.732266701218103e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2769 }, { "completion_length": 492.0, "epoch": 0.7677383592017738, "grad_norm": 0.0, "kl": 0.13142609596252441, "learning_rate": 4.73206955491466e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2770 }, { "completion_length": 533.5, "epoch": 0.7680155210643016, "grad_norm": 0.38653314113616943, "kl": 0.1121411845088005, "learning_rate": 4.73187234016293e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2771 }, { "completion_length": 477.75, "epoch": 0.7682926829268293, "grad_norm": 0.0, "kl": 0.13812687993049622, "learning_rate": 4.731675056968958e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2772 }, { "completion_length": 425.75, "epoch": 0.768569844789357, "grad_norm": 0.0, "kl": 0.16566617786884308, "learning_rate": 4.7314777053387965e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2773 }, { "completion_length": 412.25, "epoch": 0.7688470066518847, "grad_norm": 0.0, "kl": 0.1341853141784668, "learning_rate": 4.731280285278497e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2774 }, { "completion_length": 422.0, "epoch": 0.7691241685144125, "grad_norm": 0.0, "kl": 0.11995651572942734, "learning_rate": 4.731082796794112e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2775 }, { "completion_length": 476.25, "epoch": 0.7694013303769401, "grad_norm": 0.0, "kl": 0.14839617908000946, "learning_rate": 4.730885239891699e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2776 }, { "completion_length": 429.5, "epoch": 0.7696784922394678, "grad_norm": 0.4844661355018616, "kl": 0.13900938630104065, "learning_rate": 4.730687614577316e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2777 }, { "completion_length": 520.75, "epoch": 0.7699556541019955, "grad_norm": 0.0, "kl": 0.10554308444261551, "learning_rate": 4.730489920857023e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2778 }, { "completion_length": 562.75, "epoch": 0.7702328159645233, "grad_norm": 0.38792115449905396, "kl": 0.14481280744075775, "learning_rate": 4.730292158736883e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2779 }, { "completion_length": 437.0, "epoch": 0.770509977827051, "grad_norm": 0.4327208995819092, "kl": 0.14287610352039337, "learning_rate": 4.730094328222961e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2780 }, { "completion_length": 493.25, "epoch": 0.7707871396895787, "grad_norm": 0.4514303505420685, "kl": 0.14974652230739594, "learning_rate": 4.729896429321322e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2781 }, { "completion_length": 438.0, "epoch": 0.7710643015521065, "grad_norm": 0.4792897403240204, "kl": 0.1588352471590042, "learning_rate": 4.729698462038036e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2782 }, { "completion_length": 451.25, "epoch": 0.7713414634146342, "grad_norm": 0.4377879798412323, "kl": 0.40036633610725403, "learning_rate": 4.7295004263791724e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2783 }, { "completion_length": 480.5, "epoch": 0.7716186252771619, "grad_norm": 0.0, "kl": 0.15074887871742249, "learning_rate": 4.729302322350807e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2784 }, { "completion_length": 378.75, "epoch": 0.7718957871396895, "grad_norm": 0.0, "kl": 0.14618904888629913, "learning_rate": 4.729104149959011e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2785 }, { "completion_length": 499.0, "epoch": 0.7721729490022173, "grad_norm": 0.4073076844215393, "kl": 0.12414220720529556, "learning_rate": 4.728905909209866e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2786 }, { "completion_length": 421.25, "epoch": 0.772450110864745, "grad_norm": 0.0, "kl": 0.15361668169498444, "learning_rate": 4.728707600109447e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2787 }, { "completion_length": 513.0, "epoch": 0.7727272727272727, "grad_norm": 0.4090654253959656, "kl": 0.14115943014621735, "learning_rate": 4.728509222663837e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2788 }, { "completion_length": 453.0, "epoch": 0.7730044345898004, "grad_norm": 0.4765729308128357, "kl": 0.14809130132198334, "learning_rate": 4.7283107768791205e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2789 }, { "completion_length": 447.0, "epoch": 0.7732815964523282, "grad_norm": 0.0, "kl": 0.15359173715114594, "learning_rate": 4.728112262761382e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2790 }, { "completion_length": 472.0, "epoch": 0.7735587583148559, "grad_norm": 0.4440651834011078, "kl": 0.1441047191619873, "learning_rate": 4.72791368031671e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2791 }, { "completion_length": 445.75, "epoch": 0.7738359201773836, "grad_norm": 0.45398181676864624, "kl": 0.14718802273273468, "learning_rate": 4.727715029551191e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2792 }, { "completion_length": 421.25, "epoch": 0.7741130820399114, "grad_norm": 0.0, "kl": 0.16041794419288635, "learning_rate": 4.72751631047092e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2793 }, { "completion_length": 525.75, "epoch": 0.774390243902439, "grad_norm": 0.3900003433227539, "kl": 0.16172759234905243, "learning_rate": 4.72731752308199e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2794 }, { "completion_length": 472.5, "epoch": 0.7746674057649667, "grad_norm": 0.3550756275653839, "kl": 0.09592204540967941, "learning_rate": 4.7271186673904975e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2795 }, { "completion_length": 458.75, "epoch": 0.7749445676274944, "grad_norm": 0.3631892800331116, "kl": 0.11739855259656906, "learning_rate": 4.7269197434025394e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2796 }, { "completion_length": 492.0, "epoch": 0.7752217294900222, "grad_norm": 0.0, "kl": 0.21656320989131927, "learning_rate": 4.726720751124218e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2797 }, { "completion_length": 540.75, "epoch": 0.7754988913525499, "grad_norm": 0.0, "kl": 0.10212673246860504, "learning_rate": 4.726521690561632e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2798 }, { "completion_length": 522.25, "epoch": 0.7757760532150776, "grad_norm": 0.3825426995754242, "kl": 0.15862640738487244, "learning_rate": 4.7263225617208885e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2799 }, { "completion_length": 442.25, "epoch": 0.7760532150776053, "grad_norm": 0.0, "kl": 0.1251528561115265, "learning_rate": 4.726123364608094e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2800 }, { "completion_length": 461.0, "epoch": 0.7763303769401331, "grad_norm": 0.0, "kl": 0.13776953518390656, "learning_rate": 4.7259240992293556e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2801 }, { "completion_length": 547.5, "epoch": 0.7766075388026608, "grad_norm": 0.0, "kl": 0.12472383677959442, "learning_rate": 4.7257247655907854e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2802 }, { "completion_length": 487.75, "epoch": 0.7768847006651884, "grad_norm": 0.637217104434967, "kl": 0.1252019703388214, "learning_rate": 4.725525363698495e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2803 }, { "completion_length": 505.5, "epoch": 0.7771618625277162, "grad_norm": 0.3993993401527405, "kl": 0.13170573115348816, "learning_rate": 4.7253258935586e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2804 }, { "completion_length": 503.75, "epoch": 0.7774390243902439, "grad_norm": 0.4029022455215454, "kl": 0.15546779334545135, "learning_rate": 4.725126355177216e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2805 }, { "completion_length": 438.25, "epoch": 0.7777161862527716, "grad_norm": 0.39456719160079956, "kl": 0.248023122549057, "learning_rate": 4.724926748560464e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2806 }, { "completion_length": 470.0, "epoch": 0.7779933481152993, "grad_norm": 0.38493019342422485, "kl": 0.13927362859249115, "learning_rate": 4.724727073714463e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2807 }, { "completion_length": 447.5, "epoch": 0.7782705099778271, "grad_norm": 0.0, "kl": 0.11916324496269226, "learning_rate": 4.724527330645338e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2808 }, { "completion_length": 397.0, "epoch": 0.7785476718403548, "grad_norm": 0.0, "kl": 0.1325172632932663, "learning_rate": 4.724327519359214e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2809 }, { "completion_length": 381.0, "epoch": 0.7788248337028825, "grad_norm": 0.0, "kl": 0.12682375311851501, "learning_rate": 4.724127639862218e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2810 }, { "completion_length": 419.25, "epoch": 0.7791019955654102, "grad_norm": 0.46013838052749634, "kl": 0.22850576043128967, "learning_rate": 4.7239276921604796e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2811 }, { "completion_length": 467.25, "epoch": 0.779379157427938, "grad_norm": 0.0, "kl": 0.2052595168352127, "learning_rate": 4.723727676260129e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2812 }, { "completion_length": 446.0, "epoch": 0.7796563192904656, "grad_norm": 0.0, "kl": 0.12764592468738556, "learning_rate": 4.723527592167302e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2813 }, { "completion_length": 457.0, "epoch": 0.7799334811529933, "grad_norm": 0.0, "kl": 0.15384843945503235, "learning_rate": 4.723327439888132e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2814 }, { "completion_length": 427.0, "epoch": 0.780210643015521, "grad_norm": 0.0, "kl": 0.18406207859516144, "learning_rate": 4.72312721942876e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2815 }, { "completion_length": 464.5, "epoch": 0.7804878048780488, "grad_norm": 0.0, "kl": 0.12167036533355713, "learning_rate": 4.7229269307953235e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2816 }, { "completion_length": 477.5, "epoch": 0.7807649667405765, "grad_norm": 0.0, "kl": 0.10901821404695511, "learning_rate": 4.722726573993965e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2817 }, { "completion_length": 372.25, "epoch": 0.7810421286031042, "grad_norm": 0.6433279514312744, "kl": 0.12166029959917068, "learning_rate": 4.722526149030829e-06, "loss": 0.0, "reward": 2.09375, "reward_std": 2.7336158752441406, "rewards/confident_score_func": 0.25, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.59375, "step": 2818 }, { "completion_length": 481.5, "epoch": 0.781319290465632, "grad_norm": 0.4297037422657013, "kl": 0.11640319228172302, "learning_rate": 4.7223256559120614e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2819 }, { "completion_length": 394.0, "epoch": 0.7815964523281597, "grad_norm": 0.49574998021125793, "kl": 0.1417064219713211, "learning_rate": 4.72212509464381e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2820 }, { "completion_length": 432.5, "epoch": 0.7818736141906873, "grad_norm": 0.0, "kl": 0.33476191759109497, "learning_rate": 4.7219244652322265e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2821 }, { "completion_length": 385.75, "epoch": 0.782150776053215, "grad_norm": 0.38278308510780334, "kl": 0.327230840921402, "learning_rate": 4.721723767683463e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2822 }, { "completion_length": 438.25, "epoch": 0.7824279379157428, "grad_norm": 0.0, "kl": 0.12351718544960022, "learning_rate": 4.7215230020036726e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2823 }, { "completion_length": 431.75, "epoch": 0.7827050997782705, "grad_norm": 0.0, "kl": 0.15334220230579376, "learning_rate": 4.721322168199014e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2824 }, { "completion_length": 402.25, "epoch": 0.7829822616407982, "grad_norm": 0.48557984828948975, "kl": 0.13493819534778595, "learning_rate": 4.721121266275644e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2825 }, { "completion_length": 457.25, "epoch": 0.7832594235033259, "grad_norm": 0.4080545902252197, "kl": 0.1254757195711136, "learning_rate": 4.720920296239725e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2826 }, { "completion_length": 459.5, "epoch": 0.7835365853658537, "grad_norm": 0.0, "kl": 0.13501423597335815, "learning_rate": 4.7207192580974195e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2827 }, { "completion_length": 403.5, "epoch": 0.7838137472283814, "grad_norm": 0.5079309940338135, "kl": 0.13678205013275146, "learning_rate": 4.720518151854891e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2828 }, { "completion_length": 461.75, "epoch": 0.7840909090909091, "grad_norm": 0.0, "kl": 0.1426335722208023, "learning_rate": 4.72031697751831e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2829 }, { "completion_length": 511.5, "epoch": 0.7843680709534369, "grad_norm": 0.0, "kl": 0.15332099795341492, "learning_rate": 4.720115735093843e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2830 }, { "completion_length": 393.5, "epoch": 0.7846452328159645, "grad_norm": 0.5417340397834778, "kl": 0.15697386860847473, "learning_rate": 4.7199144245876614e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2831 }, { "completion_length": 408.75, "epoch": 0.7849223946784922, "grad_norm": 0.0, "kl": 0.13406908512115479, "learning_rate": 4.7197130460059385e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2832 }, { "completion_length": 426.5, "epoch": 0.7851995565410199, "grad_norm": 0.41503509879112244, "kl": 0.10658153146505356, "learning_rate": 4.719511599354851e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2833 }, { "completion_length": 410.0, "epoch": 0.7854767184035477, "grad_norm": 0.4259068965911865, "kl": 0.12802466750144958, "learning_rate": 4.719310084640575e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2834 }, { "completion_length": 424.75, "epoch": 0.7857538802660754, "grad_norm": 0.4031946659088135, "kl": 0.11840923130512238, "learning_rate": 4.719108501869292e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2835 }, { "completion_length": 424.25, "epoch": 0.7860310421286031, "grad_norm": 0.0, "kl": 0.13770508766174316, "learning_rate": 4.718906851047182e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2836 }, { "completion_length": 451.5, "epoch": 0.7863082039911308, "grad_norm": 0.3682860732078552, "kl": 0.13891063630580902, "learning_rate": 4.718705132180428e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2837 }, { "completion_length": 446.5, "epoch": 0.7865853658536586, "grad_norm": 0.0, "kl": 0.14703583717346191, "learning_rate": 4.718503345275219e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2838 }, { "completion_length": 481.75, "epoch": 0.7868625277161863, "grad_norm": 0.0, "kl": 0.17392928898334503, "learning_rate": 4.71830149033774e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2839 }, { "completion_length": 411.75, "epoch": 0.7871396895787139, "grad_norm": 0.44816625118255615, "kl": 0.1485663652420044, "learning_rate": 4.718099567374182e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2840 }, { "completion_length": 411.5, "epoch": 0.7874168514412417, "grad_norm": 0.0, "kl": 0.1570446491241455, "learning_rate": 4.717897576390739e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2841 }, { "completion_length": 488.75, "epoch": 0.7876940133037694, "grad_norm": 0.0, "kl": 0.11862964928150177, "learning_rate": 4.717695517393602e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2842 }, { "completion_length": 448.5, "epoch": 0.7879711751662971, "grad_norm": 0.39600449800491333, "kl": 0.20330488681793213, "learning_rate": 4.7174933903889695e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2843 }, { "completion_length": 478.75, "epoch": 0.7882483370288248, "grad_norm": 0.0, "kl": 0.11350181698799133, "learning_rate": 4.717291195383039e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2844 }, { "completion_length": 397.75, "epoch": 0.7885254988913526, "grad_norm": 0.0, "kl": 0.1485385298728943, "learning_rate": 4.717088932382011e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2845 }, { "completion_length": 473.75, "epoch": 0.7888026607538803, "grad_norm": 0.4256158769130707, "kl": 0.1583905816078186, "learning_rate": 4.716886601392089e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2846 }, { "completion_length": 459.5, "epoch": 0.789079822616408, "grad_norm": 0.0, "kl": 0.12178865820169449, "learning_rate": 4.716684202419477e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2847 }, { "completion_length": 471.0, "epoch": 0.7893569844789357, "grad_norm": 0.0, "kl": 0.13679814338684082, "learning_rate": 4.716481735470381e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2848 }, { "completion_length": 414.5, "epoch": 0.7896341463414634, "grad_norm": 0.0, "kl": 0.12488537281751633, "learning_rate": 4.71627920055101e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2849 }, { "completion_length": 401.0, "epoch": 0.7899113082039911, "grad_norm": 0.0, "kl": 0.12491802871227264, "learning_rate": 4.716076597667577e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2850 }, { "completion_length": 434.5, "epoch": 0.7901884700665188, "grad_norm": 0.0, "kl": 0.14851799607276917, "learning_rate": 4.715873926826293e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2851 }, { "completion_length": 417.0, "epoch": 0.7904656319290465, "grad_norm": 0.0, "kl": 0.14179843664169312, "learning_rate": 4.715671188033373e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2852 }, { "completion_length": 427.75, "epoch": 0.7907427937915743, "grad_norm": 0.4064454436302185, "kl": 0.10614871978759766, "learning_rate": 4.715468381295035e-06, "loss": -0.0, "reward": 1.46875, "reward_std": 0.5625, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 2853 }, { "completion_length": 447.25, "epoch": 0.791019955654102, "grad_norm": 0.46065935492515564, "kl": 0.12532588839530945, "learning_rate": 4.715265506617497e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2854 }, { "completion_length": 522.5, "epoch": 0.7912971175166297, "grad_norm": 0.0, "kl": 0.11474697291851044, "learning_rate": 4.715062564006982e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2855 }, { "completion_length": 458.0, "epoch": 0.7915742793791575, "grad_norm": 0.0, "kl": 0.13127292692661285, "learning_rate": 4.7148595534697125e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2856 }, { "completion_length": 518.0, "epoch": 0.7918514412416852, "grad_norm": 0.39640671014785767, "kl": 0.12136276811361313, "learning_rate": 4.714656475011914e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2857 }, { "completion_length": 439.0, "epoch": 0.7921286031042128, "grad_norm": 0.45932987332344055, "kl": 0.13345281779766083, "learning_rate": 4.714453328639814e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2858 }, { "completion_length": 509.25, "epoch": 0.7924057649667405, "grad_norm": 0.354867160320282, "kl": 0.11744096130132675, "learning_rate": 4.714250114359642e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2859 }, { "completion_length": 432.5, "epoch": 0.7926829268292683, "grad_norm": 0.4939744770526886, "kl": 0.13726802170276642, "learning_rate": 4.714046832177631e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2860 }, { "completion_length": 435.25, "epoch": 0.792960088691796, "grad_norm": 0.0, "kl": 0.11736609786748886, "learning_rate": 4.713843482100013e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2861 }, { "completion_length": 422.25, "epoch": 0.7932372505543237, "grad_norm": 0.43922024965286255, "kl": 0.13313569128513336, "learning_rate": 4.7136400641330245e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2862 }, { "completion_length": 437.5, "epoch": 0.7935144124168514, "grad_norm": 0.458294153213501, "kl": 0.13362956047058105, "learning_rate": 4.713436578282905e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2863 }, { "completion_length": 459.0, "epoch": 0.7937915742793792, "grad_norm": 0.0, "kl": 0.15053234994411469, "learning_rate": 4.713233024555893e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2864 }, { "completion_length": 451.25, "epoch": 0.7940687361419069, "grad_norm": 0.43666380643844604, "kl": 0.11573527008295059, "learning_rate": 4.71302940295823e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2865 }, { "completion_length": 418.0, "epoch": 0.7943458980044346, "grad_norm": 0.0, "kl": 0.13606657087802887, "learning_rate": 4.712825713496162e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2866 }, { "completion_length": 497.25, "epoch": 0.7946230598669624, "grad_norm": 0.0, "kl": 0.11706945300102234, "learning_rate": 4.712621956175935e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2867 }, { "completion_length": 430.5, "epoch": 0.79490022172949, "grad_norm": 0.38052865862846375, "kl": 0.1303163319826126, "learning_rate": 4.712418131003795e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2868 }, { "completion_length": 434.5, "epoch": 0.7951773835920177, "grad_norm": 0.4626181125640869, "kl": 0.10885006189346313, "learning_rate": 4.712214237985996e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2869 }, { "completion_length": 455.0, "epoch": 0.7954545454545454, "grad_norm": 0.47670987248420715, "kl": 0.1808045357465744, "learning_rate": 4.712010277128789e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2870 }, { "completion_length": 454.75, "epoch": 0.7957317073170732, "grad_norm": 0.0, "kl": 0.12753711640834808, "learning_rate": 4.711806248438428e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2871 }, { "completion_length": 486.75, "epoch": 0.7960088691796009, "grad_norm": 0.3920363187789917, "kl": 0.18633660674095154, "learning_rate": 4.711602151921169e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2872 }, { "completion_length": 480.5, "epoch": 0.7962860310421286, "grad_norm": 0.472140371799469, "kl": 0.33847400546073914, "learning_rate": 4.7113979875832736e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2873 }, { "completion_length": 484.25, "epoch": 0.7965631929046563, "grad_norm": 0.0, "kl": 0.12523417174816132, "learning_rate": 4.711193755431001e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2874 }, { "completion_length": 436.75, "epoch": 0.7968403547671841, "grad_norm": 0.0, "kl": 0.12972193956375122, "learning_rate": 4.710989455470614e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2875 }, { "completion_length": 451.5, "epoch": 0.7971175166297118, "grad_norm": 0.49274227023124695, "kl": 0.1383800208568573, "learning_rate": 4.710785087708378e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2876 }, { "completion_length": 391.0, "epoch": 0.7973946784922394, "grad_norm": 0.590927004814148, "kl": 0.2060813158750534, "learning_rate": 4.71058065215056e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2877 }, { "completion_length": 479.75, "epoch": 0.7976718403547672, "grad_norm": 0.3896282911300659, "kl": 0.11327727884054184, "learning_rate": 4.710376148803429e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2878 }, { "completion_length": 473.0, "epoch": 0.7979490022172949, "grad_norm": 0.41329893469810486, "kl": 0.1188504546880722, "learning_rate": 4.7101715776732565e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2879 }, { "completion_length": 447.5, "epoch": 0.7982261640798226, "grad_norm": 0.0, "kl": 0.26226866245269775, "learning_rate": 4.709966938766316e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2880 }, { "completion_length": 445.25, "epoch": 0.7985033259423503, "grad_norm": 0.38458898663520813, "kl": 0.14273352921009064, "learning_rate": 4.709762232088882e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2881 }, { "completion_length": 423.25, "epoch": 0.7987804878048781, "grad_norm": 0.0, "kl": 0.13930271565914154, "learning_rate": 4.709557457647234e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2882 }, { "completion_length": 442.0, "epoch": 0.7990576496674058, "grad_norm": 0.0, "kl": 0.1135002076625824, "learning_rate": 4.709352615447648e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2883 }, { "completion_length": 481.0, "epoch": 0.7993348115299335, "grad_norm": 0.3724234700202942, "kl": 0.13930480182170868, "learning_rate": 4.7091477054964105e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2884 }, { "completion_length": 408.0, "epoch": 0.7996119733924612, "grad_norm": 0.43691837787628174, "kl": 0.16589650511741638, "learning_rate": 4.708942727799802e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2885 }, { "completion_length": 475.0, "epoch": 0.799889135254989, "grad_norm": 0.0, "kl": 0.14550194144248962, "learning_rate": 4.7087376823641086e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2886 }, { "completion_length": 440.75, "epoch": 0.8001662971175166, "grad_norm": 0.47323140501976013, "kl": 0.1590711772441864, "learning_rate": 4.708532569195619e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2887 }, { "completion_length": 431.0, "epoch": 0.8004434589800443, "grad_norm": 0.41812098026275635, "kl": 0.13411404192447662, "learning_rate": 4.708327388300622e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2888 }, { "completion_length": 534.5, "epoch": 0.8007206208425721, "grad_norm": 0.0, "kl": 0.17550742626190186, "learning_rate": 4.708122139685411e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2889 }, { "completion_length": 413.25, "epoch": 0.8009977827050998, "grad_norm": 0.0, "kl": 0.10862930119037628, "learning_rate": 4.70791682335628e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2890 }, { "completion_length": 438.75, "epoch": 0.8012749445676275, "grad_norm": 0.0, "kl": 0.15892668068408966, "learning_rate": 4.707711439319524e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2891 }, { "completion_length": 471.5, "epoch": 0.8015521064301552, "grad_norm": 0.0, "kl": 0.10851491242647171, "learning_rate": 4.7075059875814424e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2892 }, { "completion_length": 502.0, "epoch": 0.801829268292683, "grad_norm": 0.3879382610321045, "kl": 0.11772440373897552, "learning_rate": 4.707300468148334e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2893 }, { "completion_length": 516.5, "epoch": 0.8021064301552107, "grad_norm": 0.40571779012680054, "kl": 0.10169193148612976, "learning_rate": 4.707094881026503e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2894 }, { "completion_length": 442.25, "epoch": 0.8023835920177383, "grad_norm": 0.0, "kl": 0.095121368765831, "learning_rate": 4.706889226222254e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2895 }, { "completion_length": 450.0, "epoch": 0.802660753880266, "grad_norm": 0.0, "kl": 0.128846675157547, "learning_rate": 4.7066835037418935e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2896 }, { "completion_length": 468.5, "epoch": 0.8029379157427938, "grad_norm": 0.0, "kl": 0.15220071375370026, "learning_rate": 4.7064777135917284e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2897 }, { "completion_length": 446.75, "epoch": 0.8032150776053215, "grad_norm": 0.4752250015735626, "kl": 0.11414887756109238, "learning_rate": 4.70627185577807e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2898 }, { "completion_length": 429.0, "epoch": 0.8034922394678492, "grad_norm": 0.0, "kl": 0.1315457969903946, "learning_rate": 4.706065930307233e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2899 }, { "completion_length": 406.0, "epoch": 0.8037694013303769, "grad_norm": 0.5409906506538391, "kl": 0.16842705011367798, "learning_rate": 4.70585993718553e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2900 }, { "completion_length": 458.25, "epoch": 0.8040465631929047, "grad_norm": 0.582050085067749, "kl": 0.12171610444784164, "learning_rate": 4.705653876419279e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2901 }, { "completion_length": 448.0, "epoch": 0.8043237250554324, "grad_norm": 0.43818822503089905, "kl": 0.10255733132362366, "learning_rate": 4.705447748014799e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2902 }, { "completion_length": 451.75, "epoch": 0.8046008869179601, "grad_norm": 0.4495324194431305, "kl": 0.1645759642124176, "learning_rate": 4.705241551978411e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2903 }, { "completion_length": 409.75, "epoch": 0.8048780487804879, "grad_norm": 0.0, "kl": 0.1543678343296051, "learning_rate": 4.705035288316439e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2904 }, { "completion_length": 377.0, "epoch": 0.8051552106430155, "grad_norm": 0.0, "kl": 0.13334377110004425, "learning_rate": 4.704828957035206e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2905 }, { "completion_length": 422.0, "epoch": 0.8054323725055432, "grad_norm": 0.4321475923061371, "kl": 0.15697985887527466, "learning_rate": 4.704622558141042e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2906 }, { "completion_length": 435.0, "epoch": 0.8057095343680709, "grad_norm": 0.0, "kl": 0.14007683098316193, "learning_rate": 4.704416091640273e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2907 }, { "completion_length": 433.0, "epoch": 0.8059866962305987, "grad_norm": 0.4750416874885559, "kl": 0.20729270577430725, "learning_rate": 4.704209557539235e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2908 }, { "completion_length": 411.0, "epoch": 0.8062638580931264, "grad_norm": 0.0, "kl": 0.12256506830453873, "learning_rate": 4.704002955844258e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2909 }, { "completion_length": 396.75, "epoch": 0.8065410199556541, "grad_norm": 0.4104202091693878, "kl": 0.12510044872760773, "learning_rate": 4.7037962865616795e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2910 }, { "completion_length": 425.75, "epoch": 0.8068181818181818, "grad_norm": 0.0, "kl": 0.1302625685930252, "learning_rate": 4.703589549697836e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2911 }, { "completion_length": 444.75, "epoch": 0.8070953436807096, "grad_norm": 0.0, "kl": 0.1631203293800354, "learning_rate": 4.703382745259068e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2912 }, { "completion_length": 435.5, "epoch": 0.8073725055432373, "grad_norm": 0.0, "kl": 0.12500539422035217, "learning_rate": 4.703175873251716e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2913 }, { "completion_length": 424.0, "epoch": 0.8076496674057649, "grad_norm": 0.0, "kl": 0.11904346942901611, "learning_rate": 4.702968933682126e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2914 }, { "completion_length": 457.0, "epoch": 0.8079268292682927, "grad_norm": 0.4290720820426941, "kl": 0.11529220640659332, "learning_rate": 4.702761926556642e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2915 }, { "completion_length": 462.75, "epoch": 0.8082039911308204, "grad_norm": 0.0, "kl": 0.10905130207538605, "learning_rate": 4.702554851881614e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2916 }, { "completion_length": 457.75, "epoch": 0.8084811529933481, "grad_norm": 0.43890246748924255, "kl": 0.1746593862771988, "learning_rate": 4.702347709663391e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2917 }, { "completion_length": 396.5, "epoch": 0.8087583148558758, "grad_norm": 0.0, "kl": 0.13687565922737122, "learning_rate": 4.702140499908325e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2918 }, { "completion_length": 410.75, "epoch": 0.8090354767184036, "grad_norm": 0.5193590521812439, "kl": 0.1631307601928711, "learning_rate": 4.701933222622771e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2919 }, { "completion_length": 415.25, "epoch": 0.8093126385809313, "grad_norm": 0.4253939986228943, "kl": 0.14288729429244995, "learning_rate": 4.701725877813084e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2920 }, { "completion_length": 402.5, "epoch": 0.809589800443459, "grad_norm": 0.0, "kl": 0.13928943872451782, "learning_rate": 4.701518465485624e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2921 }, { "completion_length": 418.5, "epoch": 0.8098669623059866, "grad_norm": 0.4780924618244171, "kl": 0.12079835683107376, "learning_rate": 4.70131098564675e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2922 }, { "completion_length": 440.75, "epoch": 0.8101441241685144, "grad_norm": 0.0, "kl": 0.11863046139478683, "learning_rate": 4.701103438302827e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2923 }, { "completion_length": 417.0, "epoch": 0.8104212860310421, "grad_norm": 0.428865522146225, "kl": 0.11811892688274384, "learning_rate": 4.700895823460216e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2924 }, { "completion_length": 397.75, "epoch": 0.8106984478935698, "grad_norm": 0.0, "kl": 0.1660279780626297, "learning_rate": 4.700688141125286e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2925 }, { "completion_length": 445.0, "epoch": 0.8109756097560976, "grad_norm": 0.5023629665374756, "kl": 0.13206171989440918, "learning_rate": 4.700480391304406e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2926 }, { "completion_length": 452.25, "epoch": 0.8112527716186253, "grad_norm": 0.46695542335510254, "kl": 0.135740727186203, "learning_rate": 4.700272574003946e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2927 }, { "completion_length": 536.25, "epoch": 0.811529933481153, "grad_norm": 0.3249543607234955, "kl": 0.1050691232085228, "learning_rate": 4.700064689230278e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2928 }, { "completion_length": 440.25, "epoch": 0.8118070953436807, "grad_norm": 0.39428502321243286, "kl": 0.11896177381277084, "learning_rate": 4.6998567369897795e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2929 }, { "completion_length": 416.5, "epoch": 0.8120842572062085, "grad_norm": 0.0, "kl": 0.13393092155456543, "learning_rate": 4.699648717288825e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2930 }, { "completion_length": 379.25, "epoch": 0.8123614190687362, "grad_norm": 0.0, "kl": 0.11800359189510345, "learning_rate": 4.699440630133794e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2931 }, { "completion_length": 406.75, "epoch": 0.8126385809312638, "grad_norm": 0.4568706750869751, "kl": 0.11562254279851913, "learning_rate": 4.69923247553107e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2932 }, { "completion_length": 448.25, "epoch": 0.8129157427937915, "grad_norm": 0.34854403138160706, "kl": 0.14379234611988068, "learning_rate": 4.699024253487035e-06, "loss": -0.0, "reward": 2.875, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2933 }, { "completion_length": 488.0, "epoch": 0.8131929046563193, "grad_norm": 0.38869598507881165, "kl": 0.12867289781570435, "learning_rate": 4.698815964008072e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2934 }, { "completion_length": 434.75, "epoch": 0.813470066518847, "grad_norm": 0.39403846859931946, "kl": 0.14805585145950317, "learning_rate": 4.698607607100571e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2935 }, { "completion_length": 400.0, "epoch": 0.8137472283813747, "grad_norm": 0.0, "kl": 0.13775524497032166, "learning_rate": 4.698399182770921e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2936 }, { "completion_length": 447.5, "epoch": 0.8140243902439024, "grad_norm": 0.0, "kl": 0.12306170910596848, "learning_rate": 4.698190691025513e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2937 }, { "completion_length": 422.5, "epoch": 0.8143015521064302, "grad_norm": 0.41685009002685547, "kl": 0.16195419430732727, "learning_rate": 4.697982131870741e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2938 }, { "completion_length": 445.25, "epoch": 0.8145787139689579, "grad_norm": 0.3794899880886078, "kl": 0.2043595165014267, "learning_rate": 4.697773505312999e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2939 }, { "completion_length": 469.25, "epoch": 0.8148558758314856, "grad_norm": 0.39576414227485657, "kl": 0.17060443758964539, "learning_rate": 4.697564811358687e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2940 }, { "completion_length": 480.25, "epoch": 0.8151330376940134, "grad_norm": 0.0, "kl": 0.1360616385936737, "learning_rate": 4.6973560500142045e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2941 }, { "completion_length": 398.5, "epoch": 0.815410199556541, "grad_norm": 0.0, "kl": 0.15200138092041016, "learning_rate": 4.697147221285953e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2942 }, { "completion_length": 363.25, "epoch": 0.8156873614190687, "grad_norm": 0.0, "kl": 0.1267082542181015, "learning_rate": 4.696938325180335e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2943 }, { "completion_length": 384.5, "epoch": 0.8159645232815964, "grad_norm": 0.0, "kl": 0.1311604231595993, "learning_rate": 4.696729361703758e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2944 }, { "completion_length": 483.5, "epoch": 0.8162416851441242, "grad_norm": 0.0, "kl": 0.1219836175441742, "learning_rate": 4.69652033086263e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2945 }, { "completion_length": 422.0, "epoch": 0.8165188470066519, "grad_norm": 0.46705642342567444, "kl": 0.11066901683807373, "learning_rate": 4.69631123266336e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2946 }, { "completion_length": 440.75, "epoch": 0.8167960088691796, "grad_norm": 0.41803181171417236, "kl": 0.17912557721138, "learning_rate": 4.69610206711236e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2947 }, { "completion_length": 425.5, "epoch": 0.8170731707317073, "grad_norm": 0.5124017596244812, "kl": 0.1840442270040512, "learning_rate": 4.695892834216047e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2948 }, { "completion_length": 403.25, "epoch": 0.8173503325942351, "grad_norm": 0.0, "kl": 0.16012056171894073, "learning_rate": 4.695683533980835e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2949 }, { "completion_length": 460.0, "epoch": 0.8176274944567627, "grad_norm": 0.5076830983161926, "kl": 0.13598233461380005, "learning_rate": 4.695474166413143e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2950 }, { "completion_length": 435.0, "epoch": 0.8179046563192904, "grad_norm": 0.48167693614959717, "kl": 0.1715787798166275, "learning_rate": 4.695264731519391e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2951 }, { "completion_length": 400.0, "epoch": 0.8181818181818182, "grad_norm": 0.5010607242584229, "kl": 0.134335458278656, "learning_rate": 4.695055229306001e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2952 }, { "completion_length": 479.75, "epoch": 0.8184589800443459, "grad_norm": 0.3958872854709625, "kl": 0.11218290776014328, "learning_rate": 4.6948456597794e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2953 }, { "completion_length": 414.75, "epoch": 0.8187361419068736, "grad_norm": 0.0, "kl": 0.2314564734697342, "learning_rate": 4.694636022946012e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2954 }, { "completion_length": 433.0, "epoch": 0.8190133037694013, "grad_norm": 0.0, "kl": 0.13104990124702454, "learning_rate": 4.694426318812266e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2955 }, { "completion_length": 373.75, "epoch": 0.8192904656319291, "grad_norm": 0.5277474522590637, "kl": 0.1657319962978363, "learning_rate": 4.694216547384594e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2956 }, { "completion_length": 372.75, "epoch": 0.8195676274944568, "grad_norm": 0.0, "kl": 0.14330437779426575, "learning_rate": 4.694006708669428e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2957 }, { "completion_length": 506.5, "epoch": 0.8198447893569845, "grad_norm": 0.0, "kl": 0.10901360958814621, "learning_rate": 4.693796802673204e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2958 }, { "completion_length": 432.5, "epoch": 0.8201219512195121, "grad_norm": 0.41884171962738037, "kl": 0.1404942274093628, "learning_rate": 4.693586829402357e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2959 }, { "completion_length": 390.25, "epoch": 0.8203991130820399, "grad_norm": 0.4823089838027954, "kl": 0.2966670095920563, "learning_rate": 4.693376788863327e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2960 }, { "completion_length": 456.75, "epoch": 0.8206762749445676, "grad_norm": 0.4510219097137451, "kl": 0.11070430278778076, "learning_rate": 4.693166681062557e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2961 }, { "completion_length": 427.0, "epoch": 0.8209534368070953, "grad_norm": 0.4649967849254608, "kl": 0.14613626897335052, "learning_rate": 4.692956506006486e-06, "loss": 0.0, "reward": 1.625, "reward_std": 0.25, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.75, "step": 2962 }, { "completion_length": 421.75, "epoch": 0.8212305986696231, "grad_norm": 0.45008501410484314, "kl": 0.13221940398216248, "learning_rate": 4.692746263701562e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2963 }, { "completion_length": 444.5, "epoch": 0.8215077605321508, "grad_norm": 0.0, "kl": 0.11770913749933243, "learning_rate": 4.692535954154232e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2964 }, { "completion_length": 430.75, "epoch": 0.8217849223946785, "grad_norm": 0.0, "kl": 0.16074468195438385, "learning_rate": 4.692325577370945e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2965 }, { "completion_length": 431.5, "epoch": 0.8220620842572062, "grad_norm": 0.4098697304725647, "kl": 0.16101504862308502, "learning_rate": 4.692115133358152e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2966 }, { "completion_length": 387.5, "epoch": 0.822339246119734, "grad_norm": 0.0, "kl": 0.12414295971393585, "learning_rate": 4.691904622122307e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2967 }, { "completion_length": 484.25, "epoch": 0.8226164079822617, "grad_norm": 0.4467189610004425, "kl": 0.1242387592792511, "learning_rate": 4.691694043669866e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2968 }, { "completion_length": 447.0, "epoch": 0.8228935698447893, "grad_norm": 0.0, "kl": 0.12457331269979477, "learning_rate": 4.6914833980072845e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2969 }, { "completion_length": 460.75, "epoch": 0.823170731707317, "grad_norm": 0.4060922861099243, "kl": 0.1132277250289917, "learning_rate": 4.691272685141025e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2970 }, { "completion_length": 463.0, "epoch": 0.8234478935698448, "grad_norm": 0.0, "kl": 0.11920872330665588, "learning_rate": 4.691061905077547e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2971 }, { "completion_length": 455.0, "epoch": 0.8237250554323725, "grad_norm": 0.4218578040599823, "kl": 0.1272214651107788, "learning_rate": 4.690851057823314e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2972 }, { "completion_length": 449.25, "epoch": 0.8240022172949002, "grad_norm": 0.0, "kl": 0.097225122153759, "learning_rate": 4.6906401433847944e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2973 }, { "completion_length": 428.75, "epoch": 0.8242793791574279, "grad_norm": 0.0, "kl": 0.15558987855911255, "learning_rate": 4.690429161768454e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2974 }, { "completion_length": 463.5, "epoch": 0.8245565410199557, "grad_norm": 0.0, "kl": 0.13513675332069397, "learning_rate": 4.690218112980763e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2975 }, { "completion_length": 402.25, "epoch": 0.8248337028824834, "grad_norm": 0.0, "kl": 0.14314444363117218, "learning_rate": 4.690006997028193e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2976 }, { "completion_length": 445.25, "epoch": 0.825110864745011, "grad_norm": 0.48305830359458923, "kl": 0.13952986896038055, "learning_rate": 4.68979581391722e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2977 }, { "completion_length": 437.5, "epoch": 0.8253880266075388, "grad_norm": 0.0, "kl": 0.11947115510702133, "learning_rate": 4.689584563654318e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2978 }, { "completion_length": 428.75, "epoch": 0.8256651884700665, "grad_norm": 0.4436224699020386, "kl": 0.13466551899909973, "learning_rate": 4.689373246245966e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2979 }, { "completion_length": 360.25, "epoch": 0.8259423503325942, "grad_norm": 0.0, "kl": 0.1701090931892395, "learning_rate": 4.689161861698643e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2980 }, { "completion_length": 404.5, "epoch": 0.8262195121951219, "grad_norm": 0.0, "kl": 0.20251889526844025, "learning_rate": 4.688950410018834e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2981 }, { "completion_length": 384.75, "epoch": 0.8264966740576497, "grad_norm": 0.0, "kl": 0.13219569623470306, "learning_rate": 4.6887388912130206e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2982 }, { "completion_length": 417.5, "epoch": 0.8267738359201774, "grad_norm": 0.0, "kl": 0.16298386454582214, "learning_rate": 4.688527305287691e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2983 }, { "completion_length": 418.25, "epoch": 0.8270509977827051, "grad_norm": 0.0, "kl": 0.14076343178749084, "learning_rate": 4.688315652249332e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2984 }, { "completion_length": 410.75, "epoch": 0.8273281596452328, "grad_norm": 0.0, "kl": 0.31036949157714844, "learning_rate": 4.688103932104436e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2985 }, { "completion_length": 420.25, "epoch": 0.8276053215077606, "grad_norm": 0.45374795794487, "kl": 0.13983994722366333, "learning_rate": 4.687892144859495e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2986 }, { "completion_length": 483.75, "epoch": 0.8278824833702882, "grad_norm": 0.4284282922744751, "kl": 0.13472816348075867, "learning_rate": 4.687680290521003e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2987 }, { "completion_length": 447.25, "epoch": 0.8281596452328159, "grad_norm": 0.440050333738327, "kl": 0.11992736160755157, "learning_rate": 4.687468369095457e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2988 }, { "completion_length": 471.25, "epoch": 0.8284368070953437, "grad_norm": 0.7083611488342285, "kl": 0.10271573066711426, "learning_rate": 4.687256380589356e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2989 }, { "completion_length": 442.25, "epoch": 0.8287139689578714, "grad_norm": 0.4880125820636749, "kl": 0.11123863607645035, "learning_rate": 4.6870443250091996e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2990 }, { "completion_length": 405.5, "epoch": 0.8289911308203991, "grad_norm": 0.0, "kl": 0.11875835806131363, "learning_rate": 4.686832202361492e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2991 }, { "completion_length": 431.25, "epoch": 0.8292682926829268, "grad_norm": 0.4172719120979309, "kl": 0.19709499180316925, "learning_rate": 4.686620012652738e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2992 }, { "completion_length": 431.25, "epoch": 0.8295454545454546, "grad_norm": 0.43134641647338867, "kl": 0.14814724028110504, "learning_rate": 4.686407755889445e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2993 }, { "completion_length": 449.5, "epoch": 0.8298226164079823, "grad_norm": 0.42625054717063904, "kl": 0.15737535059452057, "learning_rate": 4.68619543207812e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2994 }, { "completion_length": 461.25, "epoch": 0.83009977827051, "grad_norm": 0.0, "kl": 0.16334201395511627, "learning_rate": 4.685983041225276e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2995 }, { "completion_length": 432.5, "epoch": 0.8303769401330376, "grad_norm": 0.39374515414237976, "kl": 0.11065856367349625, "learning_rate": 4.6857705833374255e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2996 }, { "completion_length": 449.25, "epoch": 0.8306541019955654, "grad_norm": 0.0, "kl": 0.12439177930355072, "learning_rate": 4.685558058421084e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2997 }, { "completion_length": 440.25, "epoch": 0.8309312638580931, "grad_norm": 0.43185779452323914, "kl": 0.1133713647723198, "learning_rate": 4.685345466482769e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2998 }, { "completion_length": 383.25, "epoch": 0.8312084257206208, "grad_norm": 0.47926485538482666, "kl": 0.13506177067756653, "learning_rate": 4.6851328075289995e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 2999 }, { "completion_length": 388.0, "epoch": 0.8314855875831486, "grad_norm": 0.46605202555656433, "kl": 20.28241729736328, "learning_rate": 4.684920081566295e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3000 }, { "completion_length": 532.5, "epoch": 0.8317627494456763, "grad_norm": 0.0, "kl": 0.13424819707870483, "learning_rate": 4.684707288601182e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3001 }, { "completion_length": 463.75, "epoch": 0.832039911308204, "grad_norm": 0.4281599521636963, "kl": 0.13379357755184174, "learning_rate": 4.6844944286401844e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3002 }, { "completion_length": 414.5, "epoch": 0.8323170731707317, "grad_norm": 0.0, "kl": 0.140704944729805, "learning_rate": 4.68428150168983e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3003 }, { "completion_length": 435.25, "epoch": 0.8325942350332595, "grad_norm": 0.45952147245407104, "kl": 0.1186218187212944, "learning_rate": 4.6840685077566486e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3004 }, { "completion_length": 455.0, "epoch": 0.8328713968957872, "grad_norm": 0.4156615734100342, "kl": 0.15182772278785706, "learning_rate": 4.683855446847171e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3005 }, { "completion_length": 546.25, "epoch": 0.8331485587583148, "grad_norm": 0.0, "kl": 0.09894164651632309, "learning_rate": 4.6836423189679316e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3006 }, { "completion_length": 436.0, "epoch": 0.8334257206208425, "grad_norm": 0.4350578784942627, "kl": 0.13890142738819122, "learning_rate": 4.683429124125466e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3007 }, { "completion_length": 475.75, "epoch": 0.8337028824833703, "grad_norm": 0.0, "kl": 0.1207270547747612, "learning_rate": 4.6832158623263115e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3008 }, { "completion_length": 436.0, "epoch": 0.833980044345898, "grad_norm": 0.0, "kl": 0.11984756588935852, "learning_rate": 4.683002533577009e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3009 }, { "completion_length": 428.5, "epoch": 0.8342572062084257, "grad_norm": 0.0, "kl": 0.1060350313782692, "learning_rate": 4.682789137884101e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3010 }, { "completion_length": 399.0, "epoch": 0.8345343680709535, "grad_norm": 0.46462538838386536, "kl": 0.12905356287956238, "learning_rate": 4.682575675254128e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3011 }, { "completion_length": 442.5, "epoch": 0.8348115299334812, "grad_norm": 0.0, "kl": 0.13221101462841034, "learning_rate": 4.68236214569364e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3012 }, { "completion_length": 454.5, "epoch": 0.8350886917960089, "grad_norm": 0.0, "kl": 0.12765075266361237, "learning_rate": 4.682148549209183e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3013 }, { "completion_length": 439.0, "epoch": 0.8353658536585366, "grad_norm": 0.0, "kl": 0.15940894186496735, "learning_rate": 4.681934885807307e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3014 }, { "completion_length": 413.0, "epoch": 0.8356430155210643, "grad_norm": 0.0, "kl": 0.13037949800491333, "learning_rate": 4.681721155494565e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3015 }, { "completion_length": 447.0, "epoch": 0.835920177383592, "grad_norm": 0.4320334494113922, "kl": 0.12387462705373764, "learning_rate": 4.6815073582775115e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3016 }, { "completion_length": 404.5, "epoch": 0.8361973392461197, "grad_norm": 0.40434589982032776, "kl": 0.11645831912755966, "learning_rate": 4.681293494162702e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3017 }, { "completion_length": 425.75, "epoch": 0.8364745011086474, "grad_norm": 0.5220381021499634, "kl": 0.2146342694759369, "learning_rate": 4.681079563156694e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3018 }, { "completion_length": 465.5, "epoch": 0.8367516629711752, "grad_norm": 0.4114278256893158, "kl": 0.13317260146141052, "learning_rate": 4.6808655652660495e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3019 }, { "completion_length": 468.75, "epoch": 0.8370288248337029, "grad_norm": 0.42148396372795105, "kl": 0.16061384975910187, "learning_rate": 4.680651500497331e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3020 }, { "completion_length": 408.25, "epoch": 0.8373059866962306, "grad_norm": 0.45526233315467834, "kl": 0.12057936191558838, "learning_rate": 4.680437368857101e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3021 }, { "completion_length": 374.75, "epoch": 0.8375831485587583, "grad_norm": 0.0, "kl": 0.19852548837661743, "learning_rate": 4.6802231703519274e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3022 }, { "completion_length": 416.75, "epoch": 0.8378603104212861, "grad_norm": 0.0, "kl": 0.11173409968614578, "learning_rate": 4.680008904988379e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3023 }, { "completion_length": 431.75, "epoch": 0.8381374722838137, "grad_norm": 0.0, "kl": 0.219156414270401, "learning_rate": 4.679794572773027e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3024 }, { "completion_length": 458.25, "epoch": 0.8384146341463414, "grad_norm": 0.0, "kl": 0.1250428706407547, "learning_rate": 4.679580173712443e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3025 }, { "completion_length": 407.75, "epoch": 0.8386917960088692, "grad_norm": 0.6145135164260864, "kl": 0.13535697758197784, "learning_rate": 4.679365707813201e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3026 }, { "completion_length": 466.5, "epoch": 0.8389689578713969, "grad_norm": 0.4169125556945801, "kl": 0.12202588468790054, "learning_rate": 4.679151175081879e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3027 }, { "completion_length": 484.0, "epoch": 0.8392461197339246, "grad_norm": 0.0, "kl": 0.1534584015607834, "learning_rate": 4.678936575525055e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3028 }, { "completion_length": 411.25, "epoch": 0.8395232815964523, "grad_norm": 0.0, "kl": 0.1396733820438385, "learning_rate": 4.678721909149311e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3029 }, { "completion_length": 429.25, "epoch": 0.8398004434589801, "grad_norm": 0.0, "kl": 0.3618123531341553, "learning_rate": 4.678507175961229e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3030 }, { "completion_length": 462.75, "epoch": 0.8400776053215078, "grad_norm": 0.0, "kl": 0.10603909939527512, "learning_rate": 4.678292375967395e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3031 }, { "completion_length": 354.75, "epoch": 0.8403547671840355, "grad_norm": 0.0, "kl": 0.15158170461654663, "learning_rate": 4.678077509174394e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3032 }, { "completion_length": 413.0, "epoch": 0.8406319290465631, "grad_norm": 0.0, "kl": 0.6235723495483398, "learning_rate": 4.6778625755888175e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3033 }, { "completion_length": 418.75, "epoch": 0.8409090909090909, "grad_norm": 0.0, "kl": 0.1263161450624466, "learning_rate": 4.677647575217255e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3034 }, { "completion_length": 391.75, "epoch": 0.8411862527716186, "grad_norm": 0.4443229138851166, "kl": 0.15081943571567535, "learning_rate": 4.6774325080663e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3035 }, { "completion_length": 431.5, "epoch": 0.8414634146341463, "grad_norm": 0.4361618757247925, "kl": 0.5858852863311768, "learning_rate": 4.6772173741425485e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3036 }, { "completion_length": 430.75, "epoch": 0.8417405764966741, "grad_norm": 0.5064411163330078, "kl": 0.13641111552715302, "learning_rate": 4.677002173452596e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3037 }, { "completion_length": 427.0, "epoch": 0.8420177383592018, "grad_norm": 0.0, "kl": 0.139200821518898, "learning_rate": 4.676786906003044e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3038 }, { "completion_length": 422.75, "epoch": 0.8422949002217295, "grad_norm": 0.44929689168930054, "kl": 0.5593610405921936, "learning_rate": 4.6765715718004924e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3039 }, { "completion_length": 456.5, "epoch": 0.8425720620842572, "grad_norm": 0.432753324508667, "kl": 0.16417545080184937, "learning_rate": 4.676356170851545e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3040 }, { "completion_length": 419.25, "epoch": 0.842849223946785, "grad_norm": 0.0, "kl": 0.21342095732688904, "learning_rate": 4.676140703162808e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3041 }, { "completion_length": 430.75, "epoch": 0.8431263858093127, "grad_norm": 0.0, "kl": 0.12294408679008484, "learning_rate": 4.675925168740887e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3042 }, { "completion_length": 501.75, "epoch": 0.8434035476718403, "grad_norm": 0.48633700609207153, "kl": 0.11423490196466446, "learning_rate": 4.675709567592393e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3043 }, { "completion_length": 432.0, "epoch": 0.843680709534368, "grad_norm": 0.0, "kl": 0.14070039987564087, "learning_rate": 4.675493899723938e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3044 }, { "completion_length": 444.25, "epoch": 0.8439578713968958, "grad_norm": 0.0, "kl": 0.1524711698293686, "learning_rate": 4.675278165142134e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3045 }, { "completion_length": 443.75, "epoch": 0.8442350332594235, "grad_norm": 0.4666922092437744, "kl": 0.13681866228580475, "learning_rate": 4.675062363853599e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3046 }, { "completion_length": 419.25, "epoch": 0.8445121951219512, "grad_norm": 0.0, "kl": 0.22635850310325623, "learning_rate": 4.674846495864947e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3047 }, { "completion_length": 494.5, "epoch": 0.844789356984479, "grad_norm": 0.416795551776886, "kl": 0.12820276618003845, "learning_rate": 4.674630561182803e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3048 }, { "completion_length": 440.5, "epoch": 0.8450665188470067, "grad_norm": 0.46694764494895935, "kl": 0.20868046581745148, "learning_rate": 4.674414559813783e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3049 }, { "completion_length": 461.25, "epoch": 0.8453436807095344, "grad_norm": 0.44291940331459045, "kl": 0.14326180517673492, "learning_rate": 4.6741984917645165e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3050 }, { "completion_length": 450.5, "epoch": 0.845620842572062, "grad_norm": 0.0, "kl": 0.11935944110155106, "learning_rate": 4.673982357041625e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3051 }, { "completion_length": 486.75, "epoch": 0.8458980044345898, "grad_norm": 0.3783363401889801, "kl": 0.14554360508918762, "learning_rate": 4.673766155651739e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3052 }, { "completion_length": 447.75, "epoch": 0.8461751662971175, "grad_norm": 0.0, "kl": 0.159366175532341, "learning_rate": 4.673549887601486e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3053 }, { "completion_length": 393.75, "epoch": 0.8464523281596452, "grad_norm": 0.6029150485992432, "kl": 0.16507978737354279, "learning_rate": 4.673333552897501e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3054 }, { "completion_length": 465.5, "epoch": 0.8467294900221729, "grad_norm": 0.0, "kl": 0.12178364396095276, "learning_rate": 4.673117151546417e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3055 }, { "completion_length": 544.5, "epoch": 0.8470066518847007, "grad_norm": 0.0, "kl": 0.13034851849079132, "learning_rate": 4.67290068355487e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3056 }, { "completion_length": 425.0, "epoch": 0.8472838137472284, "grad_norm": 0.0, "kl": 0.24492624402046204, "learning_rate": 4.672684148929497e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3057 }, { "completion_length": 494.75, "epoch": 0.8475609756097561, "grad_norm": 0.0, "kl": 0.1282890886068344, "learning_rate": 4.67246754767694e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3058 }, { "completion_length": 448.0, "epoch": 0.8478381374722838, "grad_norm": 0.0, "kl": 0.13512937724590302, "learning_rate": 4.672250879803841e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3059 }, { "completion_length": 436.75, "epoch": 0.8481152993348116, "grad_norm": 0.0, "kl": 0.13830628991127014, "learning_rate": 4.672034145316844e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3060 }, { "completion_length": 432.0, "epoch": 0.8483924611973392, "grad_norm": 0.461971640586853, "kl": 0.26281946897506714, "learning_rate": 4.671817344222594e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3061 }, { "completion_length": 372.75, "epoch": 0.8486696230598669, "grad_norm": 0.0, "kl": 0.17154935002326965, "learning_rate": 4.671600476527741e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3062 }, { "completion_length": 509.25, "epoch": 0.8489467849223947, "grad_norm": 0.0, "kl": 0.1762281060218811, "learning_rate": 4.671383542238935e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3063 }, { "completion_length": 470.75, "epoch": 0.8492239467849224, "grad_norm": 0.0, "kl": 0.1434790939092636, "learning_rate": 4.671166541362829e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3064 }, { "completion_length": 495.75, "epoch": 0.8495011086474501, "grad_norm": 0.3960655629634857, "kl": 0.22821146249771118, "learning_rate": 4.670949473906077e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3065 }, { "completion_length": 463.75, "epoch": 0.8497782705099778, "grad_norm": 0.4057968854904175, "kl": 0.13974729180335999, "learning_rate": 4.6707323398753346e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3066 }, { "completion_length": 448.25, "epoch": 0.8500554323725056, "grad_norm": 0.4064670503139496, "kl": 0.136294424533844, "learning_rate": 4.670515139277262e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3067 }, { "completion_length": 378.75, "epoch": 0.8503325942350333, "grad_norm": 0.5408063530921936, "kl": 0.13633625209331512, "learning_rate": 4.6702978721185184e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3068 }, { "completion_length": 468.25, "epoch": 0.850609756097561, "grad_norm": 0.42859894037246704, "kl": 0.11031585186719894, "learning_rate": 4.670080538405769e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3069 }, { "completion_length": 411.75, "epoch": 0.8508869179600886, "grad_norm": 0.0, "kl": 0.17297635972499847, "learning_rate": 4.669863138145676e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3070 }, { "completion_length": 452.75, "epoch": 0.8511640798226164, "grad_norm": 0.4488307237625122, "kl": 0.11973532289266586, "learning_rate": 4.669645671344907e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3071 }, { "completion_length": 475.75, "epoch": 0.8514412416851441, "grad_norm": 0.4228825867176056, "kl": 0.1307695060968399, "learning_rate": 4.6694281380101304e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3072 }, { "completion_length": 491.0, "epoch": 0.8517184035476718, "grad_norm": 0.42453140020370483, "kl": 0.1678740680217743, "learning_rate": 4.669210538148017e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3073 }, { "completion_length": 444.5, "epoch": 0.8519955654101996, "grad_norm": 0.0, "kl": 0.11557832360267639, "learning_rate": 4.668992871765241e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3074 }, { "completion_length": 472.5, "epoch": 0.8522727272727273, "grad_norm": 0.35209134221076965, "kl": 0.14426018297672272, "learning_rate": 4.668775138868476e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3075 }, { "completion_length": 453.75, "epoch": 0.852549889135255, "grad_norm": 0.4677203893661499, "kl": 0.12876610457897186, "learning_rate": 4.668557339464399e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3076 }, { "completion_length": 445.25, "epoch": 0.8528270509977827, "grad_norm": 0.0, "kl": 0.15374886989593506, "learning_rate": 4.668339473559691e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3077 }, { "completion_length": 454.25, "epoch": 0.8531042128603105, "grad_norm": 0.4098805785179138, "kl": 0.13645750284194946, "learning_rate": 4.668121541161029e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3078 }, { "completion_length": 453.0, "epoch": 0.8533813747228381, "grad_norm": 0.3747955560684204, "kl": 0.11644161492586136, "learning_rate": 4.6679035422751e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3079 }, { "completion_length": 412.25, "epoch": 0.8536585365853658, "grad_norm": 0.49329590797424316, "kl": 0.3305604159832001, "learning_rate": 4.667685476908588e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3080 }, { "completion_length": 462.25, "epoch": 0.8539356984478935, "grad_norm": 0.0, "kl": 0.1632811427116394, "learning_rate": 4.667467345068179e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3081 }, { "completion_length": 454.5, "epoch": 0.8542128603104213, "grad_norm": 0.0, "kl": 0.14908680319786072, "learning_rate": 4.667249146760563e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3082 }, { "completion_length": 439.5, "epoch": 0.854490022172949, "grad_norm": 0.0, "kl": 0.11918579041957855, "learning_rate": 4.667030881992431e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3083 }, { "completion_length": 494.5, "epoch": 0.8547671840354767, "grad_norm": 0.3841082453727722, "kl": 0.292849063873291, "learning_rate": 4.666812550770476e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3084 }, { "completion_length": 462.5, "epoch": 0.8550443458980045, "grad_norm": 0.0, "kl": 0.10973047465085983, "learning_rate": 4.6665941531013956e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3085 }, { "completion_length": 457.75, "epoch": 0.8553215077605322, "grad_norm": 0.0, "kl": 0.1583152860403061, "learning_rate": 4.666375688991885e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3086 }, { "completion_length": 545.25, "epoch": 0.8555986696230599, "grad_norm": 0.35310736298561096, "kl": 0.16359688341617584, "learning_rate": 4.666157158448642e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3087 }, { "completion_length": 553.0, "epoch": 0.8558758314855875, "grad_norm": 0.0, "kl": 0.1144205778837204, "learning_rate": 4.665938561478371e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3088 }, { "completion_length": 490.0, "epoch": 0.8561529933481153, "grad_norm": 0.5038573145866394, "kl": 0.12293829023838043, "learning_rate": 4.665719898087775e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3089 }, { "completion_length": 452.5, "epoch": 0.856430155210643, "grad_norm": 0.38289570808410645, "kl": 0.10768940299749374, "learning_rate": 4.665501168283558e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3090 }, { "completion_length": 463.25, "epoch": 0.8567073170731707, "grad_norm": 0.37819012999534607, "kl": 0.1340055912733078, "learning_rate": 4.665282372072429e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3091 }, { "completion_length": 434.0, "epoch": 0.8569844789356984, "grad_norm": 0.4245779514312744, "kl": 0.12280475348234177, "learning_rate": 4.665063509461098e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3092 }, { "completion_length": 504.75, "epoch": 0.8572616407982262, "grad_norm": 0.0, "kl": 0.1136208325624466, "learning_rate": 4.664844580456274e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3093 }, { "completion_length": 493.0, "epoch": 0.8575388026607539, "grad_norm": 0.4768701195716858, "kl": 0.10465104132890701, "learning_rate": 4.664625585064672e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3094 }, { "completion_length": 457.0, "epoch": 0.8578159645232816, "grad_norm": 0.40956446528434753, "kl": 0.11875496804714203, "learning_rate": 4.664406523293009e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3095 }, { "completion_length": 403.25, "epoch": 0.8580931263858093, "grad_norm": 0.6500886082649231, "kl": 0.14376510679721832, "learning_rate": 4.664187395148001e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3096 }, { "completion_length": 467.25, "epoch": 0.858370288248337, "grad_norm": 0.41306638717651367, "kl": 0.13261756300926208, "learning_rate": 4.663968200636369e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3097 }, { "completion_length": 470.5, "epoch": 0.8586474501108647, "grad_norm": 0.0, "kl": 0.1239800900220871, "learning_rate": 4.663748939764834e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3098 }, { "completion_length": 472.75, "epoch": 0.8589246119733924, "grad_norm": 0.0, "kl": 0.13036830723285675, "learning_rate": 4.66352961254012e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3099 }, { "completion_length": 391.5, "epoch": 0.8592017738359202, "grad_norm": 0.0, "kl": 0.1385686844587326, "learning_rate": 4.663310218968952e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3100 }, { "completion_length": 398.5, "epoch": 0.8594789356984479, "grad_norm": 0.4299643635749817, "kl": 0.12922580540180206, "learning_rate": 4.66309075905806e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3101 }, { "completion_length": 482.5, "epoch": 0.8597560975609756, "grad_norm": 0.0, "kl": 0.16811232268810272, "learning_rate": 4.662871232814171e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3102 }, { "completion_length": 471.25, "epoch": 0.8600332594235033, "grad_norm": 0.5509511828422546, "kl": 0.18372398614883423, "learning_rate": 4.66265164024402e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3103 }, { "completion_length": 421.25, "epoch": 0.8603104212860311, "grad_norm": 0.4299129545688629, "kl": 0.11029158532619476, "learning_rate": 4.66243198135434e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3104 }, { "completion_length": 496.75, "epoch": 0.8605875831485588, "grad_norm": 0.41753950715065, "kl": 0.10481246560811996, "learning_rate": 4.662212256151865e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3105 }, { "completion_length": 495.25, "epoch": 0.8608647450110865, "grad_norm": 0.0, "kl": 0.10440144687891006, "learning_rate": 4.661992464643335e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3106 }, { "completion_length": 457.0, "epoch": 0.8611419068736141, "grad_norm": 0.0, "kl": 0.1202608048915863, "learning_rate": 4.661772606835491e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3107 }, { "completion_length": 503.5, "epoch": 0.8614190687361419, "grad_norm": 0.37009525299072266, "kl": 0.1086554303765297, "learning_rate": 4.6615526827350735e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3108 }, { "completion_length": 461.5, "epoch": 0.8616962305986696, "grad_norm": 0.0, "kl": 0.2792345881462097, "learning_rate": 4.661332692348827e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3109 }, { "completion_length": 481.75, "epoch": 0.8619733924611973, "grad_norm": 0.3634195029735565, "kl": 0.21477444469928741, "learning_rate": 4.6611126356834975e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3110 }, { "completion_length": 422.75, "epoch": 0.8622505543237251, "grad_norm": 0.0, "kl": 0.1274677813053131, "learning_rate": 4.660892512745834e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3111 }, { "completion_length": 487.75, "epoch": 0.8625277161862528, "grad_norm": 0.0, "kl": 0.12220550328493118, "learning_rate": 4.6606723235425845e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3112 }, { "completion_length": 472.5, "epoch": 0.8628048780487805, "grad_norm": 0.0, "kl": 0.1272049993276596, "learning_rate": 4.660452068080505e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3113 }, { "completion_length": 474.75, "epoch": 0.8630820399113082, "grad_norm": 0.0, "kl": 0.1303606480360031, "learning_rate": 4.660231746366347e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3114 }, { "completion_length": 448.0, "epoch": 0.863359201773836, "grad_norm": 0.3889297544956207, "kl": 0.1312289535999298, "learning_rate": 4.660011358406867e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3115 }, { "completion_length": 452.0, "epoch": 0.8636363636363636, "grad_norm": 0.0, "kl": 0.1309083253145218, "learning_rate": 4.659790904208825e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3116 }, { "completion_length": 514.25, "epoch": 0.8639135254988913, "grad_norm": 0.4392390847206116, "kl": 0.096250019967556, "learning_rate": 4.659570383778981e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3117 }, { "completion_length": 466.0, "epoch": 0.864190687361419, "grad_norm": 0.0, "kl": 0.12575611472129822, "learning_rate": 4.659349797124096e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3118 }, { "completion_length": 478.25, "epoch": 0.8644678492239468, "grad_norm": 0.4133940041065216, "kl": 0.11762803792953491, "learning_rate": 4.659129144250936e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3119 }, { "completion_length": 471.0, "epoch": 0.8647450110864745, "grad_norm": 0.0, "kl": 0.10929613560438156, "learning_rate": 4.658908425166266e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3120 }, { "completion_length": 416.5, "epoch": 0.8650221729490022, "grad_norm": 0.0, "kl": 0.12186572700738907, "learning_rate": 4.658687639876856e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3121 }, { "completion_length": 461.0, "epoch": 0.86529933481153, "grad_norm": 0.45658570528030396, "kl": 0.11690628528594971, "learning_rate": 4.6584667883894765e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3122 }, { "completion_length": 379.5, "epoch": 0.8655764966740577, "grad_norm": 0.0, "kl": 0.15068545937538147, "learning_rate": 4.658245870710898e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3123 }, { "completion_length": 450.75, "epoch": 0.8658536585365854, "grad_norm": 0.0, "kl": 0.11419569700956345, "learning_rate": 4.658024886847899e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3124 }, { "completion_length": 509.25, "epoch": 0.866130820399113, "grad_norm": 0.47910964488983154, "kl": 0.11164230108261108, "learning_rate": 4.657803836807252e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3125 }, { "completion_length": 512.25, "epoch": 0.8664079822616408, "grad_norm": 0.0, "kl": 0.10694241523742676, "learning_rate": 4.657582720595738e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3126 }, { "completion_length": 447.0, "epoch": 0.8666851441241685, "grad_norm": 0.4329245984554291, "kl": 0.1048135831952095, "learning_rate": 4.657361538220138e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3127 }, { "completion_length": 448.0, "epoch": 0.8669623059866962, "grad_norm": 0.4920947551727295, "kl": 0.5115119814872742, "learning_rate": 4.657140289687233e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3128 }, { "completion_length": 422.75, "epoch": 0.8672394678492239, "grad_norm": 0.5116263031959534, "kl": 0.15101079642772675, "learning_rate": 4.656918975003809e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3129 }, { "completion_length": 506.5, "epoch": 0.8675166297117517, "grad_norm": 0.314103901386261, "kl": 0.1052575409412384, "learning_rate": 4.656697594176653e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3130 }, { "completion_length": 429.0, "epoch": 0.8677937915742794, "grad_norm": 0.0, "kl": 0.1507520228624344, "learning_rate": 4.656476147212554e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3131 }, { "completion_length": 407.5, "epoch": 0.8680709534368071, "grad_norm": 0.42956846952438354, "kl": 0.11897794157266617, "learning_rate": 4.656254634118301e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3132 }, { "completion_length": 407.25, "epoch": 0.8683481152993349, "grad_norm": 0.454771488904953, "kl": 0.13066385686397552, "learning_rate": 4.65603305490069e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3133 }, { "completion_length": 470.0, "epoch": 0.8686252771618626, "grad_norm": 0.4439992308616638, "kl": 0.13834065198898315, "learning_rate": 4.655811409566512e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3134 }, { "completion_length": 468.0, "epoch": 0.8689024390243902, "grad_norm": 0.3790733814239502, "kl": 0.13531439006328583, "learning_rate": 4.655589698122567e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3135 }, { "completion_length": 443.25, "epoch": 0.8691796008869179, "grad_norm": 0.0, "kl": 0.10743766278028488, "learning_rate": 4.655367920575653e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3136 }, { "completion_length": 404.5, "epoch": 0.8694567627494457, "grad_norm": 0.0, "kl": 0.11855556070804596, "learning_rate": 4.655146076932571e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3137 }, { "completion_length": 457.0, "epoch": 0.8697339246119734, "grad_norm": 0.0, "kl": 0.10656161606311798, "learning_rate": 4.654924167200124e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3138 }, { "completion_length": 524.5, "epoch": 0.8700110864745011, "grad_norm": 0.0, "kl": 0.11981703341007233, "learning_rate": 4.654702191385117e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3139 }, { "completion_length": 444.25, "epoch": 0.8702882483370288, "grad_norm": 0.0, "kl": 0.14099843800067902, "learning_rate": 4.654480149494356e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3140 }, { "completion_length": 456.0, "epoch": 0.8705654101995566, "grad_norm": 0.39834487438201904, "kl": 0.12665411829948425, "learning_rate": 4.6542580415346525e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3141 }, { "completion_length": 475.25, "epoch": 0.8708425720620843, "grad_norm": 0.0, "kl": 0.11668084561824799, "learning_rate": 4.654035867512816e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3142 }, { "completion_length": 424.5, "epoch": 0.871119733924612, "grad_norm": 0.4170789122581482, "kl": 0.14450421929359436, "learning_rate": 4.65381362743566e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3143 }, { "completion_length": 506.75, "epoch": 0.8713968957871396, "grad_norm": 0.0, "kl": 0.10767882317304611, "learning_rate": 4.6535913213100005e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3144 }, { "completion_length": 386.75, "epoch": 0.8716740576496674, "grad_norm": 0.0, "kl": 0.3009903132915497, "learning_rate": 4.653368949142653e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3145 }, { "completion_length": 476.0, "epoch": 0.8719512195121951, "grad_norm": 0.0, "kl": 0.12079958617687225, "learning_rate": 4.6531465109404386e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3146 }, { "completion_length": 393.0, "epoch": 0.8722283813747228, "grad_norm": 0.0, "kl": 0.12910068035125732, "learning_rate": 4.652924006710177e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3147 }, { "completion_length": 423.5, "epoch": 0.8725055432372506, "grad_norm": 0.0, "kl": 0.13460808992385864, "learning_rate": 4.652701436458691e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3148 }, { "completion_length": 482.25, "epoch": 0.8727827050997783, "grad_norm": 0.0, "kl": 0.11139347404241562, "learning_rate": 4.652478800192808e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3149 }, { "completion_length": 446.25, "epoch": 0.873059866962306, "grad_norm": 0.0, "kl": 0.13744018971920013, "learning_rate": 4.652256097919354e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3150 }, { "completion_length": 451.5, "epoch": 0.8733370288248337, "grad_norm": 0.39506444334983826, "kl": 0.11394532024860382, "learning_rate": 4.652033329645159e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3151 }, { "completion_length": 513.0, "epoch": 0.8736141906873615, "grad_norm": 0.406160831451416, "kl": 0.12258463352918625, "learning_rate": 4.651810495377054e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3152 }, { "completion_length": 472.25, "epoch": 0.8738913525498891, "grad_norm": 0.0, "kl": 0.20292311906814575, "learning_rate": 4.651587595121872e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3153 }, { "completion_length": 465.75, "epoch": 0.8741685144124168, "grad_norm": 0.3993099331855774, "kl": 0.13077521324157715, "learning_rate": 4.651364628886449e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3154 }, { "completion_length": 446.25, "epoch": 0.8744456762749445, "grad_norm": 0.37086308002471924, "kl": 0.12602248787879944, "learning_rate": 4.651141596677622e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3155 }, { "completion_length": 423.75, "epoch": 0.8747228381374723, "grad_norm": 0.0, "kl": 0.16285131871700287, "learning_rate": 4.650918498502232e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3156 }, { "completion_length": 443.5, "epoch": 0.875, "grad_norm": 0.0, "kl": 0.12615500390529633, "learning_rate": 4.650695334367118e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3157 }, { "completion_length": 417.5, "epoch": 0.8752771618625277, "grad_norm": 0.0, "kl": 0.1258571743965149, "learning_rate": 4.650472104279125e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3158 }, { "completion_length": 450.25, "epoch": 0.8755543237250555, "grad_norm": 0.3740415573120117, "kl": 0.14597207307815552, "learning_rate": 4.650248808245098e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3159 }, { "completion_length": 437.25, "epoch": 0.8758314855875832, "grad_norm": 0.44930997490882874, "kl": 0.9162147045135498, "learning_rate": 4.6500254462718854e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3160 }, { "completion_length": 395.5, "epoch": 0.8761086474501109, "grad_norm": 0.0, "kl": 0.1818874031305313, "learning_rate": 4.649802018366336e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3161 }, { "completion_length": 499.25, "epoch": 0.8763858093126385, "grad_norm": 0.3950289487838745, "kl": 0.12388917058706284, "learning_rate": 4.649578524535302e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3162 }, { "completion_length": 459.25, "epoch": 0.8766629711751663, "grad_norm": 0.0, "kl": 0.11226087063550949, "learning_rate": 4.649354964785636e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3163 }, { "completion_length": 447.75, "epoch": 0.876940133037694, "grad_norm": 0.0, "kl": 0.15919674932956696, "learning_rate": 4.649131339124196e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3164 }, { "completion_length": 410.25, "epoch": 0.8772172949002217, "grad_norm": 0.48341676592826843, "kl": 0.1455000340938568, "learning_rate": 4.648907647557836e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3165 }, { "completion_length": 503.0, "epoch": 0.8774944567627494, "grad_norm": 0.0, "kl": 0.1154170036315918, "learning_rate": 4.648683890093418e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3166 }, { "completion_length": 432.5, "epoch": 0.8777716186252772, "grad_norm": 0.0, "kl": 0.3281063735485077, "learning_rate": 4.648460066737804e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3167 }, { "completion_length": 518.75, "epoch": 0.8780487804878049, "grad_norm": 0.3854646384716034, "kl": 0.17733237147331238, "learning_rate": 4.648236177497857e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3168 }, { "completion_length": 423.0, "epoch": 0.8783259423503326, "grad_norm": 0.4437912404537201, "kl": 0.13397851586341858, "learning_rate": 4.648012222380443e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3169 }, { "completion_length": 439.75, "epoch": 0.8786031042128604, "grad_norm": 0.0, "kl": 0.18189844489097595, "learning_rate": 4.647788201392429e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3170 }, { "completion_length": 453.5, "epoch": 0.878880266075388, "grad_norm": 0.0, "kl": 0.14750781655311584, "learning_rate": 4.647564114540686e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3171 }, { "completion_length": 417.5, "epoch": 0.8791574279379157, "grad_norm": 0.0, "kl": 0.2020939290523529, "learning_rate": 4.647339961832086e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3172 }, { "completion_length": 479.25, "epoch": 0.8794345898004434, "grad_norm": 0.0, "kl": 0.5431508421897888, "learning_rate": 4.647115743273501e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3173 }, { "completion_length": 462.0, "epoch": 0.8797117516629712, "grad_norm": 0.43340280652046204, "kl": 0.2231823056936264, "learning_rate": 4.646891458871808e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3174 }, { "completion_length": 420.5, "epoch": 0.8799889135254989, "grad_norm": 0.5403679013252258, "kl": 0.18521477282047272, "learning_rate": 4.646667108633885e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3175 }, { "completion_length": 387.5, "epoch": 0.8802660753880266, "grad_norm": 0.0, "kl": 0.14532805979251862, "learning_rate": 4.646442692566612e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3176 }, { "completion_length": 411.75, "epoch": 0.8805432372505543, "grad_norm": 0.45348554849624634, "kl": 0.14953534305095673, "learning_rate": 4.64621821067687e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3177 }, { "completion_length": 497.75, "epoch": 0.8808203991130821, "grad_norm": 0.3751640021800995, "kl": 0.1268509179353714, "learning_rate": 4.645993662971544e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3178 }, { "completion_length": 472.25, "epoch": 0.8810975609756098, "grad_norm": 0.42813363671302795, "kl": 0.13836048543453217, "learning_rate": 4.645769049457519e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3179 }, { "completion_length": 456.25, "epoch": 0.8813747228381374, "grad_norm": 0.0, "kl": 0.15897513926029205, "learning_rate": 4.645544370141684e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3180 }, { "completion_length": 477.0, "epoch": 0.8816518847006651, "grad_norm": 0.0, "kl": 0.14038793742656708, "learning_rate": 4.645319625030928e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3181 }, { "completion_length": 430.5, "epoch": 0.8819290465631929, "grad_norm": 0.0, "kl": 0.17030879855155945, "learning_rate": 4.645094814132144e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3182 }, { "completion_length": 464.75, "epoch": 0.8822062084257206, "grad_norm": 0.41684433817863464, "kl": 0.15903612971305847, "learning_rate": 4.644869937452224e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3183 }, { "completion_length": 491.5, "epoch": 0.8824833702882483, "grad_norm": 0.0, "kl": 0.152840256690979, "learning_rate": 4.6446449949980665e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3184 }, { "completion_length": 378.5, "epoch": 0.8827605321507761, "grad_norm": 0.0, "kl": 0.1687302589416504, "learning_rate": 4.6444199867765685e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3185 }, { "completion_length": 434.5, "epoch": 0.8830376940133038, "grad_norm": 0.45704764127731323, "kl": 0.13266631960868835, "learning_rate": 4.644194912794631e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3186 }, { "completion_length": 432.75, "epoch": 0.8833148558758315, "grad_norm": 0.0, "kl": 0.1602134108543396, "learning_rate": 4.643969773059154e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3187 }, { "completion_length": 414.25, "epoch": 0.8835920177383592, "grad_norm": 0.43510109186172485, "kl": 0.13890941441059113, "learning_rate": 4.643744567577042e-06, "loss": 0.0, "reward": 2.5, "reward_std": 2.217355728149414, "rewards/confident_score_func": 0.25, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3188 }, { "completion_length": 483.0, "epoch": 0.883869179600887, "grad_norm": 0.0, "kl": 0.16288302838802338, "learning_rate": 4.6435192963552035e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3189 }, { "completion_length": 421.0, "epoch": 0.8841463414634146, "grad_norm": 0.0, "kl": 0.16068778932094574, "learning_rate": 4.643293959400544e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3190 }, { "completion_length": 467.25, "epoch": 0.8844235033259423, "grad_norm": 0.4432770609855652, "kl": 0.15590721368789673, "learning_rate": 4.643068556719975e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3191 }, { "completion_length": 442.5, "epoch": 0.88470066518847, "grad_norm": 0.5245303511619568, "kl": 0.14422467350959778, "learning_rate": 4.642843088320408e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3192 }, { "completion_length": 479.0, "epoch": 0.8849778270509978, "grad_norm": 0.0, "kl": 0.1163695901632309, "learning_rate": 4.642617554208758e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3193 }, { "completion_length": 437.25, "epoch": 0.8852549889135255, "grad_norm": 0.0, "kl": 0.2508760988712311, "learning_rate": 4.642391954391941e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3194 }, { "completion_length": 448.0, "epoch": 0.8855321507760532, "grad_norm": 0.0, "kl": 0.1453254371881485, "learning_rate": 4.642166288876874e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3195 }, { "completion_length": 420.5, "epoch": 0.885809312638581, "grad_norm": 0.0, "kl": 0.14194810390472412, "learning_rate": 4.641940557670478e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3196 }, { "completion_length": 478.5, "epoch": 0.8860864745011087, "grad_norm": 0.43045762181282043, "kl": 0.13117288053035736, "learning_rate": 4.641714760779676e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3197 }, { "completion_length": 500.0, "epoch": 0.8863636363636364, "grad_norm": 0.3720889687538147, "kl": 0.1444433480501175, "learning_rate": 4.641488898211391e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3198 }, { "completion_length": 447.25, "epoch": 0.886640798226164, "grad_norm": 0.0, "kl": 0.14565129578113556, "learning_rate": 4.6412629699725494e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3199 }, { "completion_length": 477.0, "epoch": 0.8869179600886918, "grad_norm": 0.0, "kl": 0.13766734302043915, "learning_rate": 4.641036976070081e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3200 }, { "completion_length": 522.0, "epoch": 0.8871951219512195, "grad_norm": 0.4450347125530243, "kl": 1388.575439453125, "learning_rate": 4.640810916510915e-06, "loss": -0.0, "reward": 1.9375, "reward_std": 0.625, "rewards/confident_score_func": 0.375, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.6875, "step": 3201 }, { "completion_length": 493.5, "epoch": 0.8874722838137472, "grad_norm": 0.0, "kl": 0.13240210711956024, "learning_rate": 4.640584791301984e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3202 }, { "completion_length": 457.5, "epoch": 0.8877494456762749, "grad_norm": 0.0, "kl": 0.1623697727918625, "learning_rate": 4.640358600450221e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3203 }, { "completion_length": 459.5, "epoch": 0.8880266075388027, "grad_norm": 0.47249624133110046, "kl": 0.1484699845314026, "learning_rate": 4.640132343962565e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3204 }, { "completion_length": 437.5, "epoch": 0.8883037694013304, "grad_norm": 0.47675129771232605, "kl": 0.1314195692539215, "learning_rate": 4.639906021845951e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3205 }, { "completion_length": 439.5, "epoch": 0.8885809312638581, "grad_norm": 0.0, "kl": 0.17726056277751923, "learning_rate": 4.639679634107322e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3206 }, { "completion_length": 438.75, "epoch": 0.8888580931263859, "grad_norm": 0.0, "kl": 0.13012845814228058, "learning_rate": 4.639453180753619e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3207 }, { "completion_length": 479.5, "epoch": 0.8891352549889135, "grad_norm": 0.0, "kl": 0.15102069079875946, "learning_rate": 4.639226661791787e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3208 }, { "completion_length": 436.5, "epoch": 0.8894124168514412, "grad_norm": 0.0, "kl": 0.15806648135185242, "learning_rate": 4.639000077228773e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3209 }, { "completion_length": 435.25, "epoch": 0.8896895787139689, "grad_norm": 0.42377710342407227, "kl": 0.1682024449110031, "learning_rate": 4.638773427071523e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3210 }, { "completion_length": 559.0, "epoch": 0.8899667405764967, "grad_norm": 0.0, "kl": 0.11599278450012207, "learning_rate": 4.6385467113269914e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3211 }, { "completion_length": 368.0, "epoch": 0.8902439024390244, "grad_norm": 0.0, "kl": 0.15559262037277222, "learning_rate": 4.638319930002126e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3212 }, { "completion_length": 461.75, "epoch": 0.8905210643015521, "grad_norm": 0.4398881793022156, "kl": 0.13891583681106567, "learning_rate": 4.638093083103885e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3213 }, { "completion_length": 435.0, "epoch": 0.8907982261640798, "grad_norm": 0.46924126148223877, "kl": 0.11593890935182571, "learning_rate": 4.637866170639223e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3214 }, { "completion_length": 464.25, "epoch": 0.8910753880266076, "grad_norm": 0.0, "kl": 0.12160845845937729, "learning_rate": 4.6376391926150985e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3215 }, { "completion_length": 493.5, "epoch": 0.8913525498891353, "grad_norm": 0.38886523246765137, "kl": 0.5389755368232727, "learning_rate": 4.637412149038473e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3216 }, { "completion_length": 483.25, "epoch": 0.891629711751663, "grad_norm": 0.4052051603794098, "kl": 0.15127818286418915, "learning_rate": 4.637185039916307e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3217 }, { "completion_length": 424.5, "epoch": 0.8919068736141907, "grad_norm": 0.0, "kl": 0.1576191633939743, "learning_rate": 4.636957865255568e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3218 }, { "completion_length": 591.75, "epoch": 0.8921840354767184, "grad_norm": 0.0, "kl": 0.1508377343416214, "learning_rate": 4.63673062506322e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3219 }, { "completion_length": 406.75, "epoch": 0.8924611973392461, "grad_norm": 0.41519278287887573, "kl": 0.13936330378055573, "learning_rate": 4.636503319346233e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3220 }, { "completion_length": 497.75, "epoch": 0.8927383592017738, "grad_norm": 0.43453919887542725, "kl": 0.12949658930301666, "learning_rate": 4.636275948111575e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3221 }, { "completion_length": 519.0, "epoch": 0.8930155210643016, "grad_norm": 0.0, "kl": 0.12403556704521179, "learning_rate": 4.636048511366222e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3222 }, { "completion_length": 471.0, "epoch": 0.8932926829268293, "grad_norm": 0.0, "kl": 0.1628510057926178, "learning_rate": 4.635821009117146e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3223 }, { "completion_length": 450.0, "epoch": 0.893569844789357, "grad_norm": 0.0, "kl": 0.15527425706386566, "learning_rate": 4.635593441371325e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3224 }, { "completion_length": 534.0, "epoch": 0.8938470066518847, "grad_norm": 0.0, "kl": 0.13182607293128967, "learning_rate": 4.635365808135736e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3225 }, { "completion_length": 485.0, "epoch": 0.8941241685144125, "grad_norm": 0.0, "kl": 0.12351148575544357, "learning_rate": 4.635138109417362e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3226 }, { "completion_length": 433.5, "epoch": 0.8944013303769401, "grad_norm": 0.44896939396858215, "kl": 0.1560157686471939, "learning_rate": 4.634910345223184e-06, "loss": -0.0, "reward": 5.0, "reward_std": 1.5, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3227 }, { "completion_length": 433.25, "epoch": 0.8946784922394678, "grad_norm": 0.0, "kl": 0.1666184365749359, "learning_rate": 4.634682515560186e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3228 }, { "completion_length": 456.0, "epoch": 0.8949556541019955, "grad_norm": 0.0, "kl": 0.12423605471849442, "learning_rate": 4.634454620435356e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3229 }, { "completion_length": 407.5, "epoch": 0.8952328159645233, "grad_norm": 0.40468788146972656, "kl": 0.15627561509609222, "learning_rate": 4.634226659855681e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3230 }, { "completion_length": 447.75, "epoch": 0.895509977827051, "grad_norm": 0.4396636188030243, "kl": 0.15252991020679474, "learning_rate": 4.633998633828153e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3231 }, { "completion_length": 525.5, "epoch": 0.8957871396895787, "grad_norm": 0.0, "kl": 0.1178009957075119, "learning_rate": 4.633770542359764e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3232 }, { "completion_length": 478.0, "epoch": 0.8960643015521065, "grad_norm": 0.44009172916412354, "kl": 0.136846661567688, "learning_rate": 4.633542385457509e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3233 }, { "completion_length": 431.25, "epoch": 0.8963414634146342, "grad_norm": 0.39828795194625854, "kl": 0.14034220576286316, "learning_rate": 4.633314163128384e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3234 }, { "completion_length": 452.75, "epoch": 0.8966186252771619, "grad_norm": 0.0, "kl": 0.12958575785160065, "learning_rate": 4.633085875379388e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3235 }, { "completion_length": 464.75, "epoch": 0.8968957871396895, "grad_norm": 0.43974053859710693, "kl": 0.14455580711364746, "learning_rate": 4.632857522217522e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3236 }, { "completion_length": 464.5, "epoch": 0.8971729490022173, "grad_norm": 0.40157610177993774, "kl": 0.12906479835510254, "learning_rate": 4.632629103649788e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3237 }, { "completion_length": 456.5, "epoch": 0.897450110864745, "grad_norm": 0.0, "kl": 0.1324140727519989, "learning_rate": 4.63240061968319e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3238 }, { "completion_length": 439.25, "epoch": 0.8977272727272727, "grad_norm": 0.0, "kl": 0.14180472493171692, "learning_rate": 4.632172070324737e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3239 }, { "completion_length": 507.0, "epoch": 0.8980044345898004, "grad_norm": 0.48101750016212463, "kl": 0.11793673038482666, "learning_rate": 4.631943455581436e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3240 }, { "completion_length": 490.0, "epoch": 0.8982815964523282, "grad_norm": 0.0, "kl": 0.11457985639572144, "learning_rate": 4.631714775460297e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3241 }, { "completion_length": 447.25, "epoch": 0.8985587583148559, "grad_norm": 0.6387324929237366, "kl": 0.13202084600925446, "learning_rate": 4.631486029968335e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3242 }, { "completion_length": 502.75, "epoch": 0.8988359201773836, "grad_norm": 0.0, "kl": 0.12353473901748657, "learning_rate": 4.631257219112562e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3243 }, { "completion_length": 473.5, "epoch": 0.8991130820399114, "grad_norm": 0.3799285888671875, "kl": 0.12193720042705536, "learning_rate": 4.631028342899997e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3244 }, { "completion_length": 503.0, "epoch": 0.899390243902439, "grad_norm": 0.0, "kl": 0.14426113665103912, "learning_rate": 4.630799401337657e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3245 }, { "completion_length": 491.0, "epoch": 0.8996674057649667, "grad_norm": 0.4630395770072937, "kl": 0.11993616819381714, "learning_rate": 4.630570394432562e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3246 }, { "completion_length": 445.75, "epoch": 0.8999445676274944, "grad_norm": 0.0, "kl": 0.14702104032039642, "learning_rate": 4.630341322191738e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3247 }, { "completion_length": 522.75, "epoch": 0.9002217294900222, "grad_norm": 0.0, "kl": 0.13219504058361053, "learning_rate": 4.630112184622207e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3248 }, { "completion_length": 490.25, "epoch": 0.9004988913525499, "grad_norm": 0.35711032152175903, "kl": 0.1281079649925232, "learning_rate": 4.629882981730996e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3249 }, { "completion_length": 479.0, "epoch": 0.9007760532150776, "grad_norm": 0.0, "kl": 0.11999403685331345, "learning_rate": 4.629653713525133e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3250 }, { "completion_length": 572.5, "epoch": 0.9010532150776053, "grad_norm": 0.0, "kl": 0.1134670227766037, "learning_rate": 4.629424380011651e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3251 }, { "completion_length": 515.0, "epoch": 0.9013303769401331, "grad_norm": 0.0, "kl": 0.11919708549976349, "learning_rate": 4.6291949811975814e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3252 }, { "completion_length": 443.75, "epoch": 0.9016075388026608, "grad_norm": 0.4297451078891754, "kl": 0.5156996846199036, "learning_rate": 4.628965517089959e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3253 }, { "completion_length": 467.0, "epoch": 0.9018847006651884, "grad_norm": 0.399029016494751, "kl": 0.1340237259864807, "learning_rate": 4.62873598769582e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3254 }, { "completion_length": 592.75, "epoch": 0.9021618625277162, "grad_norm": 0.3921700716018677, "kl": 0.11112895607948303, "learning_rate": 4.628506393022204e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3255 }, { "completion_length": 413.25, "epoch": 0.9024390243902439, "grad_norm": 0.0, "kl": 0.1270446479320526, "learning_rate": 4.628276733076151e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3256 }, { "completion_length": 462.25, "epoch": 0.9027161862527716, "grad_norm": 0.4044170379638672, "kl": 0.12007453292608261, "learning_rate": 4.628047007864704e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3257 }, { "completion_length": 497.5, "epoch": 0.9029933481152993, "grad_norm": 0.0, "kl": 0.2644863426685333, "learning_rate": 4.627817217394909e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3258 }, { "completion_length": 450.25, "epoch": 0.9032705099778271, "grad_norm": 0.0, "kl": 0.13380661606788635, "learning_rate": 4.62758736167381e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3259 }, { "completion_length": 460.75, "epoch": 0.9035476718403548, "grad_norm": 0.4237990975379944, "kl": 0.24704906344413757, "learning_rate": 4.627357440708457e-06, "loss": 0.0, "reward": 5.5625, "reward_std": 0.375, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.6875, "step": 3260 }, { "completion_length": 455.5, "epoch": 0.9038248337028825, "grad_norm": 0.0, "kl": 0.13409125804901123, "learning_rate": 4.627127454505902e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3261 }, { "completion_length": 442.25, "epoch": 0.9041019955654102, "grad_norm": 0.37187886238098145, "kl": 0.14954005181789398, "learning_rate": 4.626897403073196e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3262 }, { "completion_length": 470.75, "epoch": 0.904379157427938, "grad_norm": 0.435589998960495, "kl": 0.11636219918727875, "learning_rate": 4.626667286417394e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3263 }, { "completion_length": 385.0, "epoch": 0.9046563192904656, "grad_norm": 0.0, "kl": 0.13664717972278595, "learning_rate": 4.626437104545554e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3264 }, { "completion_length": 541.0, "epoch": 0.9049334811529933, "grad_norm": 0.0, "kl": 0.13942070305347443, "learning_rate": 4.626206857464733e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3265 }, { "completion_length": 572.0, "epoch": 0.905210643015521, "grad_norm": 0.0, "kl": 0.10764807462692261, "learning_rate": 4.625976545181993e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3266 }, { "completion_length": 451.25, "epoch": 0.9054878048780488, "grad_norm": 0.0, "kl": 0.762687087059021, "learning_rate": 4.625746167704395e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3267 }, { "completion_length": 442.25, "epoch": 0.9057649667405765, "grad_norm": 0.3720291554927826, "kl": 0.15441957116127014, "learning_rate": 4.625515725039006e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3268 }, { "completion_length": 428.75, "epoch": 0.9060421286031042, "grad_norm": 0.0, "kl": 0.13146723806858063, "learning_rate": 4.625285217192891e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3269 }, { "completion_length": 512.25, "epoch": 0.906319290465632, "grad_norm": 0.0, "kl": 0.11907433718442917, "learning_rate": 4.62505464417312e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3270 }, { "completion_length": 482.75, "epoch": 0.9065964523281597, "grad_norm": 0.0, "kl": 0.10422441363334656, "learning_rate": 4.624824005986764e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3271 }, { "completion_length": 528.0, "epoch": 0.9068736141906873, "grad_norm": 0.0, "kl": 0.13917388021945953, "learning_rate": 4.6245933026408924e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3272 }, { "completion_length": 549.75, "epoch": 0.907150776053215, "grad_norm": 0.3385356366634369, "kl": 0.12447649985551834, "learning_rate": 4.6243625341425836e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3273 }, { "completion_length": 467.75, "epoch": 0.9074279379157428, "grad_norm": 0.0, "kl": 0.2697383165359497, "learning_rate": 4.624131700498913e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3274 }, { "completion_length": 440.75, "epoch": 0.9077050997782705, "grad_norm": 0.46781259775161743, "kl": 0.14514705538749695, "learning_rate": 4.62390080171696e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3275 }, { "completion_length": 474.0, "epoch": 0.9079822616407982, "grad_norm": 0.43381384015083313, "kl": 0.11685121059417725, "learning_rate": 4.623669837803803e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3276 }, { "completion_length": 448.75, "epoch": 0.9082594235033259, "grad_norm": 0.0, "kl": 0.16200484335422516, "learning_rate": 4.623438808766527e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3277 }, { "completion_length": 515.5, "epoch": 0.9085365853658537, "grad_norm": 0.3536342680454254, "kl": 0.10138336569070816, "learning_rate": 4.623207714612217e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3278 }, { "completion_length": 434.0, "epoch": 0.9088137472283814, "grad_norm": 0.4115983545780182, "kl": 0.1373562067747116, "learning_rate": 4.622976555347957e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3279 }, { "completion_length": 488.25, "epoch": 0.9090909090909091, "grad_norm": 0.4074711501598358, "kl": 0.13392916321754456, "learning_rate": 4.6227453309808384e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3280 }, { "completion_length": 519.75, "epoch": 0.9093680709534369, "grad_norm": 0.3894805610179901, "kl": 0.10492634773254395, "learning_rate": 4.622514041517951e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3281 }, { "completion_length": 463.25, "epoch": 0.9096452328159645, "grad_norm": 0.0, "kl": 0.126058429479599, "learning_rate": 4.622282686966387e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3282 }, { "completion_length": 404.0, "epoch": 0.9099223946784922, "grad_norm": 0.40008658170700073, "kl": 0.14238716661930084, "learning_rate": 4.622051267333242e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3283 }, { "completion_length": 433.5, "epoch": 0.9101995565410199, "grad_norm": 0.46730467677116394, "kl": 0.12023098021745682, "learning_rate": 4.621819782625612e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3284 }, { "completion_length": 465.0, "epoch": 0.9104767184035477, "grad_norm": 0.0, "kl": 0.11235227435827255, "learning_rate": 4.621588232850595e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3285 }, { "completion_length": 478.0, "epoch": 0.9107538802660754, "grad_norm": 0.42214637994766235, "kl": 0.1589764803647995, "learning_rate": 4.6213566180152935e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3286 }, { "completion_length": 459.0, "epoch": 0.9110310421286031, "grad_norm": 0.0, "kl": 0.15532533824443817, "learning_rate": 4.621124938126809e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3287 }, { "completion_length": 478.0, "epoch": 0.9113082039911308, "grad_norm": 0.0, "kl": 0.10095687955617905, "learning_rate": 4.620893193192245e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3288 }, { "completion_length": 459.75, "epoch": 0.9115853658536586, "grad_norm": 0.0, "kl": 0.12722884118556976, "learning_rate": 4.6206613832187105e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3289 }, { "completion_length": 464.75, "epoch": 0.9118625277161863, "grad_norm": 0.6495952606201172, "kl": 0.14270858466625214, "learning_rate": 4.620429508213314e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3290 }, { "completion_length": 436.0, "epoch": 0.9121396895787139, "grad_norm": 0.0, "kl": 0.12274645268917084, "learning_rate": 4.620197568183165e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3291 }, { "completion_length": 489.25, "epoch": 0.9124168514412417, "grad_norm": 0.0, "kl": 0.10852520912885666, "learning_rate": 4.619965563135375e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3292 }, { "completion_length": 460.5, "epoch": 0.9126940133037694, "grad_norm": 0.3872404396533966, "kl": 0.1289455145597458, "learning_rate": 4.619733493077061e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3293 }, { "completion_length": 446.0, "epoch": 0.9129711751662971, "grad_norm": 0.43323710560798645, "kl": 0.1436058133840561, "learning_rate": 4.619501358015338e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3294 }, { "completion_length": 453.0, "epoch": 0.9132483370288248, "grad_norm": 0.0, "kl": 0.11825098097324371, "learning_rate": 4.619269157957326e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3295 }, { "completion_length": 506.5, "epoch": 0.9135254988913526, "grad_norm": 0.38886475563049316, "kl": 0.11396484822034836, "learning_rate": 4.619036892910145e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3296 }, { "completion_length": 515.0, "epoch": 0.9138026607538803, "grad_norm": 0.0, "kl": 0.1853172332048416, "learning_rate": 4.618804562880917e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3297 }, { "completion_length": 472.75, "epoch": 0.914079822616408, "grad_norm": 0.0, "kl": 0.13050994277000427, "learning_rate": 4.618572167876767e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3298 }, { "completion_length": 506.0, "epoch": 0.9143569844789357, "grad_norm": 0.0, "kl": 0.18803708255290985, "learning_rate": 4.6183397079048205e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3299 }, { "completion_length": 478.5, "epoch": 0.9146341463414634, "grad_norm": 0.0, "kl": 0.1354266107082367, "learning_rate": 4.618107182972209e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3300 }, { "completion_length": 461.5, "epoch": 0.9149113082039911, "grad_norm": 0.382709801197052, "kl": 0.13840772211551666, "learning_rate": 4.617874593086061e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3301 }, { "completion_length": 427.5, "epoch": 0.9151884700665188, "grad_norm": 0.0, "kl": 0.11538863182067871, "learning_rate": 4.617641938253508e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3302 }, { "completion_length": 494.0, "epoch": 0.9154656319290465, "grad_norm": 0.0, "kl": 0.12402817606925964, "learning_rate": 4.617409218481688e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3303 }, { "completion_length": 491.5, "epoch": 0.9157427937915743, "grad_norm": 0.0, "kl": 0.1209549531340599, "learning_rate": 4.617176433777735e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3304 }, { "completion_length": 439.75, "epoch": 0.916019955654102, "grad_norm": 0.0, "kl": 0.13397984206676483, "learning_rate": 4.616943584148788e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3305 }, { "completion_length": 462.25, "epoch": 0.9162971175166297, "grad_norm": 0.0, "kl": 0.20207911729812622, "learning_rate": 4.616710669601986e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3306 }, { "completion_length": 473.5, "epoch": 0.9165742793791575, "grad_norm": 0.0, "kl": 0.11301174014806747, "learning_rate": 4.616477690144475e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3307 }, { "completion_length": 572.5, "epoch": 0.9168514412416852, "grad_norm": 0.0, "kl": 0.11748477071523666, "learning_rate": 4.616244645783396e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3308 }, { "completion_length": 489.5, "epoch": 0.9171286031042128, "grad_norm": 0.43407928943634033, "kl": 0.14133931696414948, "learning_rate": 4.616011536525898e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3309 }, { "completion_length": 439.75, "epoch": 0.9174057649667405, "grad_norm": 0.0, "kl": 0.13836508989334106, "learning_rate": 4.6157783623791285e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3310 }, { "completion_length": 456.5, "epoch": 0.9176829268292683, "grad_norm": 0.0, "kl": 0.09399433434009552, "learning_rate": 4.615545123350238e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3311 }, { "completion_length": 455.5, "epoch": 0.917960088691796, "grad_norm": 0.4256480038166046, "kl": 0.12225571274757385, "learning_rate": 4.615311819446379e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3312 }, { "completion_length": 477.0, "epoch": 0.9182372505543237, "grad_norm": 0.0, "kl": 0.16790932416915894, "learning_rate": 4.615078450674706e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3313 }, { "completion_length": 480.75, "epoch": 0.9185144124168514, "grad_norm": 0.4728969633579254, "kl": 0.10820623487234116, "learning_rate": 4.614845017042375e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3314 }, { "completion_length": 404.0, "epoch": 0.9187915742793792, "grad_norm": 0.4581604599952698, "kl": 6.2866740226745605, "learning_rate": 4.614611518556545e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3315 }, { "completion_length": 457.75, "epoch": 0.9190687361419069, "grad_norm": 0.0, "kl": 0.12912122905254364, "learning_rate": 4.6143779552243765e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3316 }, { "completion_length": 530.25, "epoch": 0.9193458980044346, "grad_norm": 0.364991694688797, "kl": 0.12879718840122223, "learning_rate": 4.614144327053032e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3317 }, { "completion_length": 477.25, "epoch": 0.9196230598669624, "grad_norm": 0.4497980773448944, "kl": 0.1094418317079544, "learning_rate": 4.613910634049675e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3318 }, { "completion_length": 487.0, "epoch": 0.91990022172949, "grad_norm": 0.0, "kl": 0.12126760929822922, "learning_rate": 4.613676876221474e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3319 }, { "completion_length": 436.25, "epoch": 0.9201773835920177, "grad_norm": 0.5138545632362366, "kl": 0.14674235880374908, "learning_rate": 4.6134430535755936e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3320 }, { "completion_length": 461.25, "epoch": 0.9204545454545454, "grad_norm": 0.4958151876926422, "kl": 0.20787152647972107, "learning_rate": 4.613209166119208e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3321 }, { "completion_length": 462.25, "epoch": 0.9207317073170732, "grad_norm": 0.0, "kl": 0.13111375272274017, "learning_rate": 4.612975213859487e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3322 }, { "completion_length": 487.25, "epoch": 0.9210088691796009, "grad_norm": 0.0, "kl": 0.1638839691877365, "learning_rate": 4.612741196803607e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3323 }, { "completion_length": 567.0, "epoch": 0.9212860310421286, "grad_norm": 0.3610539734363556, "kl": 0.13280102610588074, "learning_rate": 4.612507114958743e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3324 }, { "completion_length": 496.75, "epoch": 0.9215631929046563, "grad_norm": 0.36149242520332336, "kl": 0.10917819291353226, "learning_rate": 4.612272968332073e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3325 }, { "completion_length": 475.5, "epoch": 0.9218403547671841, "grad_norm": 0.0, "kl": 0.13457554578781128, "learning_rate": 4.612038756930778e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3326 }, { "completion_length": 472.5, "epoch": 0.9221175166297118, "grad_norm": 0.4654693603515625, "kl": 0.12438379973173141, "learning_rate": 4.611804480762041e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3327 }, { "completion_length": 467.5, "epoch": 0.9223946784922394, "grad_norm": 0.4622831344604492, "kl": 0.1464519500732422, "learning_rate": 4.6115701398330444e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3328 }, { "completion_length": 445.5, "epoch": 0.9226718403547672, "grad_norm": 0.0, "kl": 0.1200530156493187, "learning_rate": 4.6113357341509765e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3329 }, { "completion_length": 444.0, "epoch": 0.9229490022172949, "grad_norm": 0.0, "kl": 0.133739173412323, "learning_rate": 4.611101263723024e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3330 }, { "completion_length": 559.0, "epoch": 0.9232261640798226, "grad_norm": 0.3326874375343323, "kl": 0.11645563691854477, "learning_rate": 4.610866728556378e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3331 }, { "completion_length": 473.5, "epoch": 0.9235033259423503, "grad_norm": 0.0, "kl": 0.14157545566558838, "learning_rate": 4.61063212865823e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3332 }, { "completion_length": 463.25, "epoch": 0.9237804878048781, "grad_norm": 0.4743000566959381, "kl": 0.12633927166461945, "learning_rate": 4.610397464035775e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3333 }, { "completion_length": 451.25, "epoch": 0.9240576496674058, "grad_norm": 0.38163188099861145, "kl": 0.18631727993488312, "learning_rate": 4.610162734696209e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3334 }, { "completion_length": 488.75, "epoch": 0.9243348115299335, "grad_norm": 0.370322585105896, "kl": 0.13610944151878357, "learning_rate": 4.609927940646729e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3335 }, { "completion_length": 484.5, "epoch": 0.9246119733924612, "grad_norm": 0.0, "kl": 0.18332026898860931, "learning_rate": 4.609693081894537e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3336 }, { "completion_length": 459.0, "epoch": 0.924889135254989, "grad_norm": 0.0, "kl": 0.12550361454486847, "learning_rate": 4.609458158446834e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3337 }, { "completion_length": 443.0, "epoch": 0.9251662971175166, "grad_norm": 0.383851021528244, "kl": 0.16185711324214935, "learning_rate": 4.6092231703108245e-06, "loss": 0.0, "reward": 1.46875, "reward_std": 0.5625, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 3338 }, { "completion_length": 559.0, "epoch": 0.9254434589800443, "grad_norm": 0.0, "kl": 0.1834714710712433, "learning_rate": 4.6089881174937146e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3339 }, { "completion_length": 448.75, "epoch": 0.9257206208425721, "grad_norm": 0.4421597123146057, "kl": 0.18145620822906494, "learning_rate": 4.608753000002713e-06, "loss": -0.0, "reward": 3.5, "reward_std": 2.0615527629852295, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3340 }, { "completion_length": 434.75, "epoch": 0.9259977827050998, "grad_norm": 0.3971846401691437, "kl": 0.11441487073898315, "learning_rate": 4.608517817845028e-06, "loss": -0.0, "reward": 3.25, "reward_std": 1.9148542881011963, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3341 }, { "completion_length": 475.0, "epoch": 0.9262749445676275, "grad_norm": 0.0, "kl": 0.1571420580148697, "learning_rate": 4.6082825710278724e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3342 }, { "completion_length": 445.5, "epoch": 0.9265521064301552, "grad_norm": 0.5221933722496033, "kl": 0.13087402284145355, "learning_rate": 4.608047259558462e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3343 }, { "completion_length": 484.75, "epoch": 0.926829268292683, "grad_norm": 0.418949156999588, "kl": 0.10737652331590652, "learning_rate": 4.60781188344401e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3344 }, { "completion_length": 475.25, "epoch": 0.9271064301552107, "grad_norm": 0.0, "kl": 0.12151284515857697, "learning_rate": 4.607576442691737e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3345 }, { "completion_length": 449.25, "epoch": 0.9273835920177383, "grad_norm": 0.4398902952671051, "kl": 0.13437630236148834, "learning_rate": 4.607340937308861e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3346 }, { "completion_length": 465.0, "epoch": 0.927660753880266, "grad_norm": 0.0, "kl": 0.119949109852314, "learning_rate": 4.607105367302605e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3347 }, { "completion_length": 392.0, "epoch": 0.9279379157427938, "grad_norm": 0.0, "kl": 0.17260901629924774, "learning_rate": 4.606869732680193e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3348 }, { "completion_length": 450.75, "epoch": 0.9282150776053215, "grad_norm": 0.0, "kl": 0.13477946817874908, "learning_rate": 4.6066340334488506e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3349 }, { "completion_length": 500.5, "epoch": 0.9284922394678492, "grad_norm": 0.0, "kl": 0.14067202806472778, "learning_rate": 4.606398269615805e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3350 }, { "completion_length": 422.5, "epoch": 0.9287694013303769, "grad_norm": 0.0, "kl": 0.1338777393102646, "learning_rate": 4.6061624411882874e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3351 }, { "completion_length": 444.0, "epoch": 0.9290465631929047, "grad_norm": 0.0, "kl": 0.14922523498535156, "learning_rate": 4.6059265481735295e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3352 }, { "completion_length": 497.5, "epoch": 0.9293237250554324, "grad_norm": 0.0, "kl": 0.14548376202583313, "learning_rate": 4.605690590578765e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3353 }, { "completion_length": 460.25, "epoch": 0.9296008869179601, "grad_norm": 0.0, "kl": 0.13078118860721588, "learning_rate": 4.605454568411228e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3354 }, { "completion_length": 513.75, "epoch": 0.9298780487804879, "grad_norm": 0.0, "kl": 0.12465842068195343, "learning_rate": 4.60521848167816e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3355 }, { "completion_length": 461.25, "epoch": 0.9301552106430155, "grad_norm": 0.0, "kl": 0.5945078134536743, "learning_rate": 4.604982330386797e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3356 }, { "completion_length": 495.5, "epoch": 0.9304323725055432, "grad_norm": 0.0, "kl": 0.129891037940979, "learning_rate": 4.6047461145443825e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3357 }, { "completion_length": 501.5, "epoch": 0.9307095343680709, "grad_norm": 0.3810967803001404, "kl": 0.11283277720212936, "learning_rate": 4.604509834158161e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3358 }, { "completion_length": 436.75, "epoch": 0.9309866962305987, "grad_norm": 0.4869295060634613, "kl": 0.1234297975897789, "learning_rate": 4.604273489235378e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3359 }, { "completion_length": 512.0, "epoch": 0.9312638580931264, "grad_norm": 0.0, "kl": 0.12043575942516327, "learning_rate": 4.604037079783279e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3360 }, { "completion_length": 493.75, "epoch": 0.9315410199556541, "grad_norm": 0.3987596333026886, "kl": 0.1318923830986023, "learning_rate": 4.603800605809117e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3361 }, { "completion_length": 452.25, "epoch": 0.9318181818181818, "grad_norm": 0.40178313851356506, "kl": 0.1306280791759491, "learning_rate": 4.603564067320141e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3362 }, { "completion_length": 533.75, "epoch": 0.9320953436807096, "grad_norm": 0.0, "kl": 0.10959338396787643, "learning_rate": 4.603327464323606e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3363 }, { "completion_length": 498.5, "epoch": 0.9323725055432373, "grad_norm": 0.0, "kl": 0.14391617476940155, "learning_rate": 4.603090796826767e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3364 }, { "completion_length": 513.75, "epoch": 0.9326496674057649, "grad_norm": 0.36634907126426697, "kl": 0.14709575474262238, "learning_rate": 4.602854064836881e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3365 }, { "completion_length": 505.75, "epoch": 0.9329268292682927, "grad_norm": 0.3849717974662781, "kl": 0.14213576912879944, "learning_rate": 4.602617268361211e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3366 }, { "completion_length": 398.25, "epoch": 0.9332039911308204, "grad_norm": 0.4538748264312744, "kl": 0.14713813364505768, "learning_rate": 4.602380407407014e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3367 }, { "completion_length": 496.25, "epoch": 0.9334811529933481, "grad_norm": 0.0, "kl": 0.1255313903093338, "learning_rate": 4.602143481981556e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3368 }, { "completion_length": 505.5, "epoch": 0.9337583148558758, "grad_norm": 0.43303072452545166, "kl": 0.16853995621204376, "learning_rate": 4.601906492092102e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3369 }, { "completion_length": 554.0, "epoch": 0.9340354767184036, "grad_norm": 0.3648996651172638, "kl": 0.12719836831092834, "learning_rate": 4.6016694377459196e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3370 }, { "completion_length": 458.5, "epoch": 0.9343126385809313, "grad_norm": 0.0, "kl": 0.13905645906925201, "learning_rate": 4.601432318950279e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3371 }, { "completion_length": 513.5, "epoch": 0.934589800443459, "grad_norm": 0.0, "kl": 0.1312084197998047, "learning_rate": 4.60119513571245e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3372 }, { "completion_length": 481.0, "epoch": 0.9348669623059866, "grad_norm": 0.42331892251968384, "kl": 0.12907564640045166, "learning_rate": 4.600957888039707e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3373 }, { "completion_length": 451.75, "epoch": 0.9351441241685144, "grad_norm": 0.0, "kl": 0.14629089832305908, "learning_rate": 4.600720575939326e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3374 }, { "completion_length": 426.75, "epoch": 0.9354212860310421, "grad_norm": 0.0, "kl": 0.13717173039913177, "learning_rate": 4.600483199418584e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3375 }, { "completion_length": 442.5, "epoch": 0.9356984478935698, "grad_norm": 0.35627293586730957, "kl": 0.13700933754444122, "learning_rate": 4.600245758484759e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3376 }, { "completion_length": 488.5, "epoch": 0.9359756097560976, "grad_norm": 0.0, "kl": 0.13531404733657837, "learning_rate": 4.600008253145134e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3377 }, { "completion_length": 468.5, "epoch": 0.9362527716186253, "grad_norm": 0.0, "kl": 0.3192620873451233, "learning_rate": 4.599770683406992e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3378 }, { "completion_length": 510.25, "epoch": 0.936529933481153, "grad_norm": 0.0, "kl": 0.14776304364204407, "learning_rate": 4.599533049277617e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3379 }, { "completion_length": 504.75, "epoch": 0.9368070953436807, "grad_norm": 0.38598760962486267, "kl": 0.12129788100719452, "learning_rate": 4.599295350764298e-06, "loss": 0.0, "reward": 4.5, "reward_std": 1.8929693698883057, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3380 }, { "completion_length": 489.75, "epoch": 0.9370842572062085, "grad_norm": 0.0, "kl": 0.14477795362472534, "learning_rate": 4.599057587874323e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3381 }, { "completion_length": 494.75, "epoch": 0.9373614190687362, "grad_norm": 0.387358695268631, "kl": 0.16645501554012299, "learning_rate": 4.5988197606149845e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3382 }, { "completion_length": 515.0, "epoch": 0.9376385809312638, "grad_norm": 0.0, "kl": 0.13566336035728455, "learning_rate": 4.598581868993573e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3383 }, { "completion_length": 441.25, "epoch": 0.9379157427937915, "grad_norm": 0.4294794201850891, "kl": 0.16461032629013062, "learning_rate": 4.598343913017387e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3384 }, { "completion_length": 485.75, "epoch": 0.9381929046563193, "grad_norm": 0.3939123749732971, "kl": 0.13481411337852478, "learning_rate": 4.5981058926937215e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3385 }, { "completion_length": 498.5, "epoch": 0.938470066518847, "grad_norm": 0.3736973702907562, "kl": 0.13954094052314758, "learning_rate": 4.597867808029876e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3386 }, { "completion_length": 449.25, "epoch": 0.9387472283813747, "grad_norm": 0.0, "kl": 0.13513803482055664, "learning_rate": 4.597629659033153e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3387 }, { "completion_length": 481.0, "epoch": 0.9390243902439024, "grad_norm": 0.376707524061203, "kl": 0.15155129134655, "learning_rate": 4.597391445710853e-06, "loss": 0.0, "reward": 4.0, "reward_std": 2.0615527629852295, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3388 }, { "completion_length": 486.5, "epoch": 0.9393015521064302, "grad_norm": 0.0, "kl": 0.12112656235694885, "learning_rate": 4.597153168070283e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3389 }, { "completion_length": 454.0, "epoch": 0.9395787139689579, "grad_norm": 0.0, "kl": 0.15165507793426514, "learning_rate": 4.596914826118749e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3390 }, { "completion_length": 431.25, "epoch": 0.9398558758314856, "grad_norm": 0.0, "kl": 0.16044634580612183, "learning_rate": 4.596676419863561e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3391 }, { "completion_length": 468.25, "epoch": 0.9401330376940134, "grad_norm": 0.45784974098205566, "kl": 0.15325017273426056, "learning_rate": 4.596437949312028e-06, "loss": -0.0, "reward": 4.875, "reward_std": 1.75, "rewards/confident_score_func": 1.625, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3392 }, { "completion_length": 507.5, "epoch": 0.940410199556541, "grad_norm": 0.40314266085624695, "kl": 0.12264711409807205, "learning_rate": 4.596199414471464e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3393 }, { "completion_length": 451.0, "epoch": 0.9406873614190687, "grad_norm": 0.42326343059539795, "kl": 0.1620166152715683, "learning_rate": 4.595960815349185e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3394 }, { "completion_length": 423.75, "epoch": 0.9409645232815964, "grad_norm": 0.0, "kl": 0.1812472641468048, "learning_rate": 4.595722151952506e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3395 }, { "completion_length": 496.5, "epoch": 0.9412416851441242, "grad_norm": 0.0, "kl": 0.137712299823761, "learning_rate": 4.595483424288747e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3396 }, { "completion_length": 556.25, "epoch": 0.9415188470066519, "grad_norm": 0.0, "kl": 0.18805661797523499, "learning_rate": 4.595244632365228e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3397 }, { "completion_length": 487.5, "epoch": 0.9417960088691796, "grad_norm": 0.4202888309955597, "kl": 0.15230990946292877, "learning_rate": 4.595005776189272e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3398 }, { "completion_length": 430.25, "epoch": 0.9420731707317073, "grad_norm": 0.0, "kl": 0.23230044543743134, "learning_rate": 4.594766855768203e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3399 }, { "completion_length": 442.25, "epoch": 0.9423503325942351, "grad_norm": 0.0, "kl": 0.13620220124721527, "learning_rate": 4.594527871109351e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3400 }, { "completion_length": 554.5, "epoch": 0.9426274944567627, "grad_norm": 0.0, "kl": 0.13192135095596313, "learning_rate": 4.59428882222004e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3401 }, { "completion_length": 448.25, "epoch": 0.9429046563192904, "grad_norm": 0.3954842686653137, "kl": 0.17037321627140045, "learning_rate": 4.594049709107604e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3402 }, { "completion_length": 474.25, "epoch": 0.9431818181818182, "grad_norm": 0.0, "kl": 0.13539136946201324, "learning_rate": 4.593810531779374e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3403 }, { "completion_length": 453.75, "epoch": 0.9434589800443459, "grad_norm": 0.0, "kl": 0.15139350295066833, "learning_rate": 4.593571290242685e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3404 }, { "completion_length": 508.75, "epoch": 0.9437361419068736, "grad_norm": 0.4306130111217499, "kl": 0.1441994160413742, "learning_rate": 4.593331984504874e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3405 }, { "completion_length": 572.0, "epoch": 0.9440133037694013, "grad_norm": 0.0, "kl": 0.14866457879543304, "learning_rate": 4.593092614573279e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3406 }, { "completion_length": 479.0, "epoch": 0.9442904656319291, "grad_norm": 0.3984682261943817, "kl": 0.15168310701847076, "learning_rate": 4.59285318045524e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3407 }, { "completion_length": 479.0, "epoch": 0.9445676274944568, "grad_norm": 0.0, "kl": 0.12297317385673523, "learning_rate": 4.5926136821581006e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3408 }, { "completion_length": 480.25, "epoch": 0.9448447893569845, "grad_norm": 0.0, "kl": 0.1550597846508026, "learning_rate": 4.5923741196892046e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3409 }, { "completion_length": 523.5, "epoch": 0.9451219512195121, "grad_norm": 0.0, "kl": 0.1257329285144806, "learning_rate": 4.592134493055898e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3410 }, { "completion_length": 454.0, "epoch": 0.9453991130820399, "grad_norm": 0.3750206232070923, "kl": 0.11471589654684067, "learning_rate": 4.59189480226553e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3411 }, { "completion_length": 448.25, "epoch": 0.9456762749445676, "grad_norm": 0.0, "kl": 0.14978088438510895, "learning_rate": 4.591655047325451e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3412 }, { "completion_length": 456.5, "epoch": 0.9459534368070953, "grad_norm": 0.4344111680984497, "kl": 0.22396181523799896, "learning_rate": 4.591415228243012e-06, "loss": 0.0, "reward": 5.5, "reward_std": 0.5, "rewards/confident_score_func": 1.75, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3413 }, { "completion_length": 523.75, "epoch": 0.9462305986696231, "grad_norm": 0.3520444929599762, "kl": 0.1348995715379715, "learning_rate": 4.591175345025567e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3414 }, { "completion_length": 450.5, "epoch": 0.9465077605321508, "grad_norm": 0.0, "kl": 0.14880941808223724, "learning_rate": 4.590935397680474e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3415 }, { "completion_length": 482.75, "epoch": 0.9467849223946785, "grad_norm": 0.39604392647743225, "kl": 0.11613249033689499, "learning_rate": 4.590695386215091e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3416 }, { "completion_length": 451.5, "epoch": 0.9470620842572062, "grad_norm": 0.0, "kl": 0.1714836061000824, "learning_rate": 4.590455310636778e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3417 }, { "completion_length": 494.0, "epoch": 0.947339246119734, "grad_norm": 0.3474384546279907, "kl": 0.15026500821113586, "learning_rate": 4.590215170952895e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3418 }, { "completion_length": 571.75, "epoch": 0.9476164079822617, "grad_norm": 0.0, "kl": 0.11899064481258392, "learning_rate": 4.58997496717081e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3419 }, { "completion_length": 464.75, "epoch": 0.9478935698447893, "grad_norm": 0.0, "kl": 0.14324349164962769, "learning_rate": 4.5897346992978855e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3420 }, { "completion_length": 433.75, "epoch": 0.948170731707317, "grad_norm": 0.0, "kl": 0.15633732080459595, "learning_rate": 4.58949436734149e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3421 }, { "completion_length": 450.0, "epoch": 0.9484478935698448, "grad_norm": 0.0, "kl": 0.17511729896068573, "learning_rate": 4.589253971308995e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3422 }, { "completion_length": 428.75, "epoch": 0.9487250554323725, "grad_norm": 0.0, "kl": 0.1649726778268814, "learning_rate": 4.589013511207773e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3423 }, { "completion_length": 456.25, "epoch": 0.9490022172949002, "grad_norm": 0.0, "kl": 0.13604186475276947, "learning_rate": 4.5887729870451955e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3424 }, { "completion_length": 456.5, "epoch": 0.9492793791574279, "grad_norm": 0.4126867353916168, "kl": 0.16142232716083527, "learning_rate": 4.58853239882864e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3425 }, { "completion_length": 464.0, "epoch": 0.9495565410199557, "grad_norm": 0.3612980544567108, "kl": 0.11454497277736664, "learning_rate": 4.588291746565484e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3426 }, { "completion_length": 543.5, "epoch": 0.9498337028824834, "grad_norm": 0.0, "kl": 0.2549920380115509, "learning_rate": 4.588051030263107e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3427 }, { "completion_length": 438.0, "epoch": 0.950110864745011, "grad_norm": 0.43848180770874023, "kl": 0.13825568556785583, "learning_rate": 4.587810249928891e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3428 }, { "completion_length": 484.25, "epoch": 0.9503880266075388, "grad_norm": 0.4087885916233063, "kl": 0.36182937026023865, "learning_rate": 4.587569405570219e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3429 }, { "completion_length": 373.75, "epoch": 0.9506651884700665, "grad_norm": 0.5017073750495911, "kl": 0.18405704200267792, "learning_rate": 4.587328497194478e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3430 }, { "completion_length": 442.75, "epoch": 0.9509423503325942, "grad_norm": 0.0, "kl": 0.13890695571899414, "learning_rate": 4.587087524809055e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3431 }, { "completion_length": 532.5, "epoch": 0.9512195121951219, "grad_norm": 0.3264912962913513, "kl": 0.12365707755088806, "learning_rate": 4.58684648842134e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3432 }, { "completion_length": 512.5, "epoch": 0.9514966740576497, "grad_norm": 0.37712782621383667, "kl": 0.1617886871099472, "learning_rate": 4.586605388038724e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3433 }, { "completion_length": 482.75, "epoch": 0.9517738359201774, "grad_norm": 0.0, "kl": 0.2065555900335312, "learning_rate": 4.586364223668601e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3434 }, { "completion_length": 467.0, "epoch": 0.9520509977827051, "grad_norm": 0.4068586528301239, "kl": 0.1390436440706253, "learning_rate": 4.586122995318366e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3435 }, { "completion_length": 561.5, "epoch": 0.9523281596452328, "grad_norm": 0.38086241483688354, "kl": 0.1681165248155594, "learning_rate": 4.585881702995417e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3436 }, { "completion_length": 483.75, "epoch": 0.9526053215077606, "grad_norm": 0.0, "kl": 0.12334839254617691, "learning_rate": 4.585640346707153e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3437 }, { "completion_length": 513.75, "epoch": 0.9528824833702882, "grad_norm": 0.0, "kl": 0.1339377760887146, "learning_rate": 4.585398926460976e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3438 }, { "completion_length": 488.75, "epoch": 0.9531596452328159, "grad_norm": 0.0, "kl": 0.14180631935596466, "learning_rate": 4.58515744226429e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3439 }, { "completion_length": 441.75, "epoch": 0.9534368070953437, "grad_norm": 0.0, "kl": 0.18097485601902008, "learning_rate": 4.584915894124498e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3440 }, { "completion_length": 505.25, "epoch": 0.9537139689578714, "grad_norm": 0.0, "kl": 0.15538841485977173, "learning_rate": 4.5846742820490085e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3441 }, { "completion_length": 421.25, "epoch": 0.9539911308203991, "grad_norm": 0.0, "kl": 0.17410226166248322, "learning_rate": 4.58443260604523e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3442 }, { "completion_length": 491.5, "epoch": 0.9542682926829268, "grad_norm": 0.4503602385520935, "kl": 0.12669266760349274, "learning_rate": 4.584190866120576e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3443 }, { "completion_length": 450.75, "epoch": 0.9545454545454546, "grad_norm": 0.0, "kl": 0.17807143926620483, "learning_rate": 4.583949062282458e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3444 }, { "completion_length": 452.5, "epoch": 0.9548226164079823, "grad_norm": 0.0, "kl": 0.14601409435272217, "learning_rate": 4.583707194538291e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3445 }, { "completion_length": 441.0, "epoch": 0.95509977827051, "grad_norm": 0.4242972135543823, "kl": 0.18564705550670624, "learning_rate": 4.5834652628954915e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3446 }, { "completion_length": 459.75, "epoch": 0.9553769401330376, "grad_norm": 0.0, "kl": 0.1840646117925644, "learning_rate": 4.58322326736148e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3447 }, { "completion_length": 485.75, "epoch": 0.9556541019955654, "grad_norm": 0.3943822681903839, "kl": 0.14821237325668335, "learning_rate": 4.582981207943678e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3448 }, { "completion_length": 456.5, "epoch": 0.9559312638580931, "grad_norm": 0.4092123806476593, "kl": 0.20437972247600555, "learning_rate": 4.582739084649506e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3449 }, { "completion_length": 484.0, "epoch": 0.9562084257206208, "grad_norm": 0.40332648158073425, "kl": 0.13828861713409424, "learning_rate": 4.582496897486391e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3450 }, { "completion_length": 447.5, "epoch": 0.9564855875831486, "grad_norm": 0.0, "kl": 0.19180649518966675, "learning_rate": 4.582254646461758e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3451 }, { "completion_length": 533.5, "epoch": 0.9567627494456763, "grad_norm": 0.0, "kl": 0.18072131276130676, "learning_rate": 4.582012331583038e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3452 }, { "completion_length": 389.0, "epoch": 0.957039911308204, "grad_norm": 0.4520835876464844, "kl": 0.17003510892391205, "learning_rate": 4.58176995285766e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3453 }, { "completion_length": 450.75, "epoch": 0.9573170731707317, "grad_norm": 0.0, "kl": 0.1378811150789261, "learning_rate": 4.5815275102930575e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3454 }, { "completion_length": 515.0, "epoch": 0.9575942350332595, "grad_norm": 0.45748671889305115, "kl": 0.1578921675682068, "learning_rate": 4.581285003896666e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3455 }, { "completion_length": 477.5, "epoch": 0.9578713968957872, "grad_norm": 0.3762809932231903, "kl": 0.14248107373714447, "learning_rate": 4.58104243367592e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3456 }, { "completion_length": 479.5, "epoch": 0.9581485587583148, "grad_norm": 0.0, "kl": 0.14031882584095, "learning_rate": 4.580799799638261e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3457 }, { "completion_length": 494.5, "epoch": 0.9584257206208425, "grad_norm": 0.0, "kl": 0.2746296226978302, "learning_rate": 4.580557101791128e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3458 }, { "completion_length": 470.5, "epoch": 0.9587028824833703, "grad_norm": 0.0, "kl": 0.1495225727558136, "learning_rate": 4.5803143401419635e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3459 }, { "completion_length": 437.25, "epoch": 0.958980044345898, "grad_norm": 0.4885742664337158, "kl": 0.2751595377922058, "learning_rate": 4.580071514698211e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3460 }, { "completion_length": 560.25, "epoch": 0.9592572062084257, "grad_norm": 0.0, "kl": 0.1441262811422348, "learning_rate": 4.579828625467319e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3461 }, { "completion_length": 512.0, "epoch": 0.9595343680709535, "grad_norm": 0.0, "kl": 0.13254621624946594, "learning_rate": 4.5795856724567344e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3462 }, { "completion_length": 418.0, "epoch": 0.9598115299334812, "grad_norm": 0.0, "kl": 0.17555034160614014, "learning_rate": 4.5793426556739085e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3463 }, { "completion_length": 426.0, "epoch": 0.9600886917960089, "grad_norm": 0.4359709620475769, "kl": 0.150188609957695, "learning_rate": 4.579099575126293e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3464 }, { "completion_length": 462.0, "epoch": 0.9603658536585366, "grad_norm": 0.4282473027706146, "kl": 2.761972665786743, "learning_rate": 4.5788564308213425e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3465 }, { "completion_length": 450.5, "epoch": 0.9606430155210643, "grad_norm": 0.4103180766105652, "kl": 0.16961215436458588, "learning_rate": 4.578613222766513e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3466 }, { "completion_length": 471.25, "epoch": 0.960920177383592, "grad_norm": 0.45663249492645264, "kl": 0.28757962584495544, "learning_rate": 4.578369950969263e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3467 }, { "completion_length": 415.25, "epoch": 0.9611973392461197, "grad_norm": 0.4274783134460449, "kl": 0.1733887791633606, "learning_rate": 4.578126615437052e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3468 }, { "completion_length": 480.0, "epoch": 0.9614745011086474, "grad_norm": 0.0, "kl": 0.17598474025726318, "learning_rate": 4.577883216177342e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3469 }, { "completion_length": 440.5, "epoch": 0.9617516629711752, "grad_norm": 0.4227040708065033, "kl": 0.14150671660900116, "learning_rate": 4.577639753197599e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3470 }, { "completion_length": 464.5, "epoch": 0.9620288248337029, "grad_norm": 0.37721776962280273, "kl": 0.20434734225273132, "learning_rate": 4.5773962265052866e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3471 }, { "completion_length": 464.75, "epoch": 0.9623059866962306, "grad_norm": 0.379217267036438, "kl": 0.15024183690547943, "learning_rate": 4.577152636107874e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3472 }, { "completion_length": 482.25, "epoch": 0.9625831485587583, "grad_norm": 0.3848246932029724, "kl": 0.18263401091098785, "learning_rate": 4.576908982012831e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3473 }, { "completion_length": 406.0, "epoch": 0.9628603104212861, "grad_norm": 0.0, "kl": 0.1820678859949112, "learning_rate": 4.576665264227627e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3474 }, { "completion_length": 499.5, "epoch": 0.9631374722838137, "grad_norm": 0.38649243116378784, "kl": 0.4480830132961273, "learning_rate": 4.576421482759741e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3475 }, { "completion_length": 516.25, "epoch": 0.9634146341463414, "grad_norm": 0.3922850489616394, "kl": 0.3118031919002533, "learning_rate": 4.576177637616645e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3476 }, { "completion_length": 539.25, "epoch": 0.9636917960088692, "grad_norm": 0.0, "kl": 0.13321684300899506, "learning_rate": 4.575933728805817e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3477 }, { "completion_length": 491.5, "epoch": 0.9639689578713969, "grad_norm": 0.0, "kl": 0.12579979002475739, "learning_rate": 4.575689756334737e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3478 }, { "completion_length": 447.75, "epoch": 0.9642461197339246, "grad_norm": 0.0, "kl": 0.13861671090126038, "learning_rate": 4.5754457202108884e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3479 }, { "completion_length": 437.5, "epoch": 0.9645232815964523, "grad_norm": 0.46729734539985657, "kl": 0.14426426589488983, "learning_rate": 4.575201620441752e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3480 }, { "completion_length": 441.0, "epoch": 0.9648004434589801, "grad_norm": 0.0, "kl": 0.1650833636522293, "learning_rate": 4.574957457034814e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3481 }, { "completion_length": 496.25, "epoch": 0.9650776053215078, "grad_norm": 0.4021226167678833, "kl": 0.1858244687318802, "learning_rate": 4.5747132299975634e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3482 }, { "completion_length": 462.25, "epoch": 0.9653547671840355, "grad_norm": 0.43042314052581787, "kl": 0.12856818735599518, "learning_rate": 4.574468939337488e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3483 }, { "completion_length": 414.0, "epoch": 0.9656319290465631, "grad_norm": 0.4107063114643097, "kl": 0.22531776130199432, "learning_rate": 4.574224585062081e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3484 }, { "completion_length": 507.25, "epoch": 0.9659090909090909, "grad_norm": 0.0, "kl": 0.139811173081398, "learning_rate": 4.573980167178833e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3485 }, { "completion_length": 425.25, "epoch": 0.9661862527716186, "grad_norm": 0.4442485272884369, "kl": 0.2022966742515564, "learning_rate": 4.573735685695242e-06, "loss": -0.0, "reward": 3.625, "reward_std": 2.462214469909668, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3486 }, { "completion_length": 555.5, "epoch": 0.9664634146341463, "grad_norm": 0.0, "kl": 0.11347795277833939, "learning_rate": 4.573491140618803e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3487 }, { "completion_length": 433.25, "epoch": 0.9667405764966741, "grad_norm": 0.0, "kl": 0.13964693248271942, "learning_rate": 4.573246531957016e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3488 }, { "completion_length": 456.75, "epoch": 0.9670177383592018, "grad_norm": 0.0, "kl": 0.15795336663722992, "learning_rate": 4.573001859717384e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3489 }, { "completion_length": 492.0, "epoch": 0.9672949002217295, "grad_norm": 0.0, "kl": 0.1313278079032898, "learning_rate": 4.572757123907407e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3490 }, { "completion_length": 506.0, "epoch": 0.9675720620842572, "grad_norm": 0.3750755786895752, "kl": 0.14011229574680328, "learning_rate": 4.572512324534592e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3491 }, { "completion_length": 498.75, "epoch": 0.967849223946785, "grad_norm": 0.3824603855609894, "kl": 0.12283740192651749, "learning_rate": 4.572267461606446e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3492 }, { "completion_length": 408.5, "epoch": 0.9681263858093127, "grad_norm": 0.0, "kl": 0.163011834025383, "learning_rate": 4.572022535130476e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3493 }, { "completion_length": 455.0, "epoch": 0.9684035476718403, "grad_norm": 0.0, "kl": 0.1829557716846466, "learning_rate": 4.5717775451141955e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3494 }, { "completion_length": 439.5, "epoch": 0.968680709534368, "grad_norm": 0.4120931625366211, "kl": 0.42452389001846313, "learning_rate": 4.571532491565115e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3495 }, { "completion_length": 500.25, "epoch": 0.9689578713968958, "grad_norm": 0.44233036041259766, "kl": 0.22743600606918335, "learning_rate": 4.571287374490751e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3496 }, { "completion_length": 485.25, "epoch": 0.9692350332594235, "grad_norm": 0.4256434738636017, "kl": 0.14230766892433167, "learning_rate": 4.571042193898619e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3497 }, { "completion_length": 482.0, "epoch": 0.9695121951219512, "grad_norm": 0.34670889377593994, "kl": 0.1840604841709137, "learning_rate": 4.57079694979624e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3498 }, { "completion_length": 510.5, "epoch": 0.969789356984479, "grad_norm": 0.0, "kl": 0.13383714854717255, "learning_rate": 4.570551642191131e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3499 }, { "completion_length": 483.5, "epoch": 0.9700665188470067, "grad_norm": 0.0, "kl": 0.13536104559898376, "learning_rate": 4.570306271090818e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3500 }, { "completion_length": 527.0, "epoch": 0.9703436807095344, "grad_norm": 0.37039855122566223, "kl": 0.1619718372821808, "learning_rate": 4.570060836502822e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3501 }, { "completion_length": 470.25, "epoch": 0.970620842572062, "grad_norm": 0.0, "kl": 0.16567440330982208, "learning_rate": 4.569815338434672e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3502 }, { "completion_length": 496.75, "epoch": 0.9708980044345898, "grad_norm": 0.0, "kl": 0.13994047045707703, "learning_rate": 4.569569776893897e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3503 }, { "completion_length": 467.75, "epoch": 0.9711751662971175, "grad_norm": 0.4097515344619751, "kl": 0.12980087101459503, "learning_rate": 4.569324151888025e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3504 }, { "completion_length": 542.0, "epoch": 0.9714523281596452, "grad_norm": 0.3202982246875763, "kl": 0.11641064286231995, "learning_rate": 4.56907846342459e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3505 }, { "completion_length": 536.75, "epoch": 0.9717294900221729, "grad_norm": 0.3983082175254822, "kl": 0.12794269621372223, "learning_rate": 4.568832711511125e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3506 }, { "completion_length": 500.0, "epoch": 0.9720066518847007, "grad_norm": 0.0, "kl": 0.1828085035085678, "learning_rate": 4.568586896155167e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3507 }, { "completion_length": 526.5, "epoch": 0.9722838137472284, "grad_norm": 0.3451041579246521, "kl": 0.16702547669410706, "learning_rate": 4.568341017364254e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3508 }, { "completion_length": 499.25, "epoch": 0.9725609756097561, "grad_norm": 0.360244482755661, "kl": 0.14179514348506927, "learning_rate": 4.568095075145927e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3509 }, { "completion_length": 483.75, "epoch": 0.9728381374722838, "grad_norm": 0.37412694096565247, "kl": 0.15304270386695862, "learning_rate": 4.567849069507726e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3510 }, { "completion_length": 444.75, "epoch": 0.9731152993348116, "grad_norm": 0.0, "kl": 0.14507892727851868, "learning_rate": 4.567603000457197e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3511 }, { "completion_length": 488.5, "epoch": 0.9733924611973392, "grad_norm": 0.45784780383110046, "kl": 0.13192781805992126, "learning_rate": 4.567356868001884e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3512 }, { "completion_length": 447.25, "epoch": 0.9736696230598669, "grad_norm": 0.0, "kl": 0.13542337715625763, "learning_rate": 4.567110672149337e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3513 }, { "completion_length": 510.0, "epoch": 0.9739467849223947, "grad_norm": 0.0, "kl": 0.6244128942489624, "learning_rate": 4.566864412907104e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3514 }, { "completion_length": 471.75, "epoch": 0.9742239467849224, "grad_norm": 0.389555424451828, "kl": 0.14238573610782623, "learning_rate": 4.566618090282737e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3515 }, { "completion_length": 454.25, "epoch": 0.9745011086474501, "grad_norm": 0.47187405824661255, "kl": 0.15672236680984497, "learning_rate": 4.56637170428379e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3516 }, { "completion_length": 451.5, "epoch": 0.9747782705099778, "grad_norm": 0.40186643600463867, "kl": 0.23867681622505188, "learning_rate": 4.566125254917819e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3517 }, { "completion_length": 502.0, "epoch": 0.9750554323725056, "grad_norm": 0.35653531551361084, "kl": 0.12364272773265839, "learning_rate": 4.565878742192382e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3518 }, { "completion_length": 439.5, "epoch": 0.9753325942350333, "grad_norm": 0.3921535909175873, "kl": 0.15646016597747803, "learning_rate": 4.565632166115037e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3519 }, { "completion_length": 454.75, "epoch": 0.975609756097561, "grad_norm": 0.36241698265075684, "kl": 0.11409993469715118, "learning_rate": 4.565385526693347e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3520 }, { "completion_length": 457.75, "epoch": 0.9758869179600886, "grad_norm": 0.40538060665130615, "kl": 0.11304226517677307, "learning_rate": 4.565138823934874e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3521 }, { "completion_length": 472.0, "epoch": 0.9761640798226164, "grad_norm": 0.37782198190689087, "kl": 0.1299373209476471, "learning_rate": 4.564892057847184e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3522 }, { "completion_length": 431.25, "epoch": 0.9764412416851441, "grad_norm": 0.0, "kl": 0.1388789415359497, "learning_rate": 4.564645228437845e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3523 }, { "completion_length": 512.0, "epoch": 0.9767184035476718, "grad_norm": 0.3975660502910614, "kl": 0.12190110236406326, "learning_rate": 4.564398335714426e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3524 }, { "completion_length": 499.5, "epoch": 0.9769955654101996, "grad_norm": 0.37152159214019775, "kl": 0.13818016648292542, "learning_rate": 4.564151379684497e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3525 }, { "completion_length": 434.75, "epoch": 0.9772727272727273, "grad_norm": 0.4298805892467499, "kl": 0.1760188192129135, "learning_rate": 4.563904360355631e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3526 }, { "completion_length": 484.5, "epoch": 0.977549889135255, "grad_norm": 0.3666607439517975, "kl": 0.4160931706428528, "learning_rate": 4.563657277735405e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3527 }, { "completion_length": 472.0, "epoch": 0.9778270509977827, "grad_norm": 0.0, "kl": 0.2233985960483551, "learning_rate": 4.563410131831395e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3528 }, { "completion_length": 519.25, "epoch": 0.9781042128603105, "grad_norm": 0.0, "kl": 0.12287114560604095, "learning_rate": 4.563162922651178e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3529 }, { "completion_length": 529.75, "epoch": 0.9783813747228381, "grad_norm": 0.4390554130077362, "kl": 0.1178613156080246, "learning_rate": 4.562915650202339e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3530 }, { "completion_length": 473.5, "epoch": 0.9786585365853658, "grad_norm": 0.0, "kl": 0.3910660445690155, "learning_rate": 4.562668314492458e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3531 }, { "completion_length": 521.25, "epoch": 0.9789356984478935, "grad_norm": 0.0, "kl": 0.11485502123832703, "learning_rate": 4.56242091552912e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3532 }, { "completion_length": 512.0, "epoch": 0.9792128603104213, "grad_norm": 0.0, "kl": 0.3336574137210846, "learning_rate": 4.562173453319911e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3533 }, { "completion_length": 416.5, "epoch": 0.979490022172949, "grad_norm": 0.0, "kl": 0.11669348925352097, "learning_rate": 4.561925927872421e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3534 }, { "completion_length": 437.0, "epoch": 0.9797671840354767, "grad_norm": 0.3792208433151245, "kl": 0.11251508444547653, "learning_rate": 4.561678339194242e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3535 }, { "completion_length": 486.5, "epoch": 0.9800443458980045, "grad_norm": 0.0, "kl": 0.11920589953660965, "learning_rate": 4.5614306872929625e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3536 }, { "completion_length": 506.0, "epoch": 0.9803215077605322, "grad_norm": 0.0, "kl": 0.1019008606672287, "learning_rate": 4.5611829721761804e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3537 }, { "completion_length": 467.5, "epoch": 0.9805986696230599, "grad_norm": 0.3640437722206116, "kl": 0.13335342705249786, "learning_rate": 4.560935193851491e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3538 }, { "completion_length": 480.0, "epoch": 0.9808758314855875, "grad_norm": 0.0, "kl": 0.14212730526924133, "learning_rate": 4.5606873523264915e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3539 }, { "completion_length": 496.0, "epoch": 0.9811529933481153, "grad_norm": 0.0, "kl": 0.12231235206127167, "learning_rate": 4.560439447608784e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3540 }, { "completion_length": 495.5, "epoch": 0.981430155210643, "grad_norm": 0.40092045068740845, "kl": 0.15118680894374847, "learning_rate": 4.560191479705969e-06, "loss": -0.0, "reward": 3.0625, "reward_std": 3.3000946044921875, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.5625, "step": 3541 }, { "completion_length": 498.0, "epoch": 0.9817073170731707, "grad_norm": 0.399323046207428, "kl": 0.14012745022773743, "learning_rate": 4.559943448625652e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3542 }, { "completion_length": 471.75, "epoch": 0.9819844789356984, "grad_norm": 0.0, "kl": 0.11213894188404083, "learning_rate": 4.5596953543754385e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3543 }, { "completion_length": 519.25, "epoch": 0.9822616407982262, "grad_norm": 0.0, "kl": 0.11590079218149185, "learning_rate": 4.559447196962937e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3544 }, { "completion_length": 503.0, "epoch": 0.9825388026607539, "grad_norm": 0.0, "kl": 0.13676537573337555, "learning_rate": 4.559198976395757e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3545 }, { "completion_length": 474.75, "epoch": 0.9828159645232816, "grad_norm": 0.501807689666748, "kl": 0.17656446993350983, "learning_rate": 4.558950692681509e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3546 }, { "completion_length": 714.75, "epoch": 0.9830931263858093, "grad_norm": 0.23882164061069489, "kl": 0.09487888962030411, "learning_rate": 4.55870234582781e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3547 }, { "completion_length": 484.5, "epoch": 0.983370288248337, "grad_norm": 0.0, "kl": 0.10664540529251099, "learning_rate": 4.558453935842274e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3548 }, { "completion_length": 397.0, "epoch": 0.9836474501108647, "grad_norm": 0.0, "kl": 0.13421201705932617, "learning_rate": 4.558205462732518e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3549 }, { "completion_length": 478.0, "epoch": 0.9839246119733924, "grad_norm": 0.36769330501556396, "kl": 0.13715872168540955, "learning_rate": 4.557956926506163e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3550 }, { "completion_length": 564.75, "epoch": 0.9842017738359202, "grad_norm": 0.3655608296394348, "kl": 0.10351027548313141, "learning_rate": 4.5577083271708294e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3551 }, { "completion_length": 589.5, "epoch": 0.9844789356984479, "grad_norm": 0.0, "kl": 0.25285887718200684, "learning_rate": 4.5574596647341414e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3552 }, { "completion_length": 487.25, "epoch": 0.9847560975609756, "grad_norm": 0.0, "kl": 0.12587977945804596, "learning_rate": 4.557210939203724e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3553 }, { "completion_length": 515.75, "epoch": 0.9850332594235033, "grad_norm": 0.38148561120033264, "kl": 0.1283973753452301, "learning_rate": 4.5569621505872055e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3554 }, { "completion_length": 408.0, "epoch": 0.9853104212860311, "grad_norm": 0.4653277099132538, "kl": 0.29783061146736145, "learning_rate": 4.556713298892215e-06, "loss": 0.0, "reward": 4.0625, "reward_std": 3.375, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.5625, "step": 3555 }, { "completion_length": 548.25, "epoch": 0.9855875831485588, "grad_norm": 0.35223135352134705, "kl": 0.12714309990406036, "learning_rate": 4.556464384126382e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3556 }, { "completion_length": 558.75, "epoch": 0.9858647450110865, "grad_norm": 0.0, "kl": 0.23350785672664642, "learning_rate": 4.5562154062973415e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3557 }, { "completion_length": 503.25, "epoch": 0.9861419068736141, "grad_norm": 0.3463461101055145, "kl": 0.10630585998296738, "learning_rate": 4.555966365412729e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3558 }, { "completion_length": 438.75, "epoch": 0.9864190687361419, "grad_norm": 0.0, "kl": 0.1773146539926529, "learning_rate": 4.555717261480179e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3559 }, { "completion_length": 511.5, "epoch": 0.9866962305986696, "grad_norm": 0.0, "kl": 0.10979227721691132, "learning_rate": 4.555468094507334e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3560 }, { "completion_length": 482.25, "epoch": 0.9869733924611973, "grad_norm": 0.0, "kl": 0.1468292474746704, "learning_rate": 4.555218864501832e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3561 }, { "completion_length": 496.75, "epoch": 0.9872505543237251, "grad_norm": 0.0, "kl": 0.1368822306394577, "learning_rate": 4.554969571471317e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3562 }, { "completion_length": 421.0, "epoch": 0.9875277161862528, "grad_norm": 0.387855589389801, "kl": 0.1175895631313324, "learning_rate": 4.554720215423433e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3563 }, { "completion_length": 583.5, "epoch": 0.9878048780487805, "grad_norm": 0.3262958228588104, "kl": 0.12131108343601227, "learning_rate": 4.554470796365829e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3564 }, { "completion_length": 486.0, "epoch": 0.9880820399113082, "grad_norm": 0.3253732919692993, "kl": 0.11687831580638885, "learning_rate": 4.554221314306151e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3565 }, { "completion_length": 462.5, "epoch": 0.988359201773836, "grad_norm": 0.5688585042953491, "kl": 0.11447440832853317, "learning_rate": 4.553971769252051e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3566 }, { "completion_length": 485.0, "epoch": 0.9886363636363636, "grad_norm": 0.5291939377784729, "kl": 0.1948584020137787, "learning_rate": 4.553722161211182e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3567 }, { "completion_length": 554.5, "epoch": 0.9889135254988913, "grad_norm": 0.0, "kl": 0.1272365301847458, "learning_rate": 4.553472490191196e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3568 }, { "completion_length": 468.0, "epoch": 0.989190687361419, "grad_norm": 0.3998660147190094, "kl": 0.11886312812566757, "learning_rate": 4.553222756199752e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3569 }, { "completion_length": 519.25, "epoch": 0.9894678492239468, "grad_norm": 0.3459624946117401, "kl": 0.11935350298881531, "learning_rate": 4.552972959244507e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3570 }, { "completion_length": 479.25, "epoch": 0.9897450110864745, "grad_norm": 0.0, "kl": 0.12602034211158752, "learning_rate": 4.552723099333121e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3571 }, { "completion_length": 495.0, "epoch": 0.9900221729490022, "grad_norm": 0.3431428372859955, "kl": 0.11724621802568436, "learning_rate": 4.552473176473258e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3572 }, { "completion_length": 414.75, "epoch": 0.99029933481153, "grad_norm": 0.0, "kl": 0.1796264797449112, "learning_rate": 4.55222319067258e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3573 }, { "completion_length": 653.25, "epoch": 0.9905764966740577, "grad_norm": 0.0, "kl": 0.1068972796201706, "learning_rate": 4.551973141938753e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3574 }, { "completion_length": 524.25, "epoch": 0.9908536585365854, "grad_norm": 0.35726839303970337, "kl": 0.11842586100101471, "learning_rate": 4.551723030279446e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3575 }, { "completion_length": 477.75, "epoch": 0.991130820399113, "grad_norm": 0.0, "kl": 0.5648365020751953, "learning_rate": 4.551472855702329e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3576 }, { "completion_length": 566.25, "epoch": 0.9914079822616408, "grad_norm": 0.0, "kl": 0.09380242228507996, "learning_rate": 4.551222618215073e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3577 }, { "completion_length": 443.5, "epoch": 0.9916851441241685, "grad_norm": 0.0, "kl": 0.1271849274635315, "learning_rate": 4.550972317825353e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3578 }, { "completion_length": 484.25, "epoch": 0.9919623059866962, "grad_norm": 0.0, "kl": 0.22115077078342438, "learning_rate": 4.550721954540843e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3579 }, { "completion_length": 542.75, "epoch": 0.9922394678492239, "grad_norm": 0.0, "kl": 0.8221780061721802, "learning_rate": 4.5504715283692224e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3580 }, { "completion_length": 481.0, "epoch": 0.9925166297117517, "grad_norm": 0.3683590888977051, "kl": 0.1370190531015396, "learning_rate": 4.550221039318169e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3581 }, { "completion_length": 508.25, "epoch": 0.9927937915742794, "grad_norm": 0.0, "kl": 0.1020168885588646, "learning_rate": 4.549970487395365e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3582 }, { "completion_length": 473.5, "epoch": 0.9930709534368071, "grad_norm": 0.3978689908981323, "kl": 0.12167824804782867, "learning_rate": 4.549719872608495e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3583 }, { "completion_length": 504.5, "epoch": 0.9933481152993349, "grad_norm": 0.0, "kl": 0.1253470629453659, "learning_rate": 4.5494691949652416e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3584 }, { "completion_length": 584.75, "epoch": 0.9936252771618626, "grad_norm": 0.0, "kl": 0.08938585966825485, "learning_rate": 4.549218454473294e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3585 }, { "completion_length": 469.0, "epoch": 0.9939024390243902, "grad_norm": 0.0, "kl": 0.14229300618171692, "learning_rate": 4.548967651140341e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3586 }, { "completion_length": 448.75, "epoch": 0.9941796008869179, "grad_norm": 0.34872451424598694, "kl": 0.12169509381055832, "learning_rate": 4.548716784974074e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3587 }, { "completion_length": 544.5, "epoch": 0.9944567627494457, "grad_norm": 0.0, "kl": 0.13458241522312164, "learning_rate": 4.548465855982186e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3588 }, { "completion_length": 476.25, "epoch": 0.9947339246119734, "grad_norm": 0.0, "kl": 0.12090805172920227, "learning_rate": 4.548214864172371e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3589 }, { "completion_length": 475.0, "epoch": 0.9950110864745011, "grad_norm": 0.0, "kl": 0.12808263301849365, "learning_rate": 4.547963809552326e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3590 }, { "completion_length": 492.75, "epoch": 0.9952882483370288, "grad_norm": 0.0, "kl": 0.14565198123455048, "learning_rate": 4.547712692129751e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3591 }, { "completion_length": 512.0, "epoch": 0.9955654101995566, "grad_norm": 0.0, "kl": 0.121567502617836, "learning_rate": 4.547461511912346e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3592 }, { "completion_length": 514.0, "epoch": 0.9958425720620843, "grad_norm": 0.0, "kl": 0.11858668923377991, "learning_rate": 4.547210268907813e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3593 }, { "completion_length": 484.5, "epoch": 0.996119733924612, "grad_norm": 0.0, "kl": 0.100443534553051, "learning_rate": 4.5469589631238575e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3594 }, { "completion_length": 571.0, "epoch": 0.9963968957871396, "grad_norm": 0.0, "kl": 0.10035035014152527, "learning_rate": 4.546707594568186e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3595 }, { "completion_length": 540.25, "epoch": 0.9966740576496674, "grad_norm": 0.33248260617256165, "kl": 0.11222036182880402, "learning_rate": 4.5464561632485075e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3596 }, { "completion_length": 489.0, "epoch": 0.9969512195121951, "grad_norm": 0.3973288834095001, "kl": 0.13930094242095947, "learning_rate": 4.54620466917253e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3597 }, { "completion_length": 446.25, "epoch": 0.9972283813747228, "grad_norm": 0.4322092533111572, "kl": 0.11533568799495697, "learning_rate": 4.545953112347967e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3598 }, { "completion_length": 528.25, "epoch": 0.9975055432372506, "grad_norm": 0.3459514081478119, "kl": 0.11107542365789413, "learning_rate": 4.545701492782535e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3599 }, { "completion_length": 553.0, "epoch": 0.9977827050997783, "grad_norm": 0.3632090985774994, "kl": 0.10332595556974411, "learning_rate": 4.545449810483947e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3600 }, { "completion_length": 623.25, "epoch": 0.998059866962306, "grad_norm": 0.0, "kl": 0.19257596135139465, "learning_rate": 4.545198065459922e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3601 }, { "completion_length": 528.75, "epoch": 0.9983370288248337, "grad_norm": 0.3308175802230835, "kl": 0.16665604710578918, "learning_rate": 4.5449462577181805e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3602 }, { "completion_length": 535.5, "epoch": 0.9986141906873615, "grad_norm": 0.0, "kl": 0.1116691380739212, "learning_rate": 4.544694387266444e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3603 }, { "completion_length": 494.5, "epoch": 0.9988913525498891, "grad_norm": 0.0, "kl": 0.10801203548908234, "learning_rate": 4.544442454112436e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3604 }, { "completion_length": 448.25, "epoch": 0.9991685144124168, "grad_norm": 0.0, "kl": 0.14383140206336975, "learning_rate": 4.544190458263883e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3605 }, { "completion_length": 529.5, "epoch": 0.9994456762749445, "grad_norm": 0.3748316466808319, "kl": 0.08824170380830765, "learning_rate": 4.543938399728512e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3606 }, { "completion_length": 462.25, "epoch": 0.9997228381374723, "grad_norm": 0.0, "kl": 0.14324462413787842, "learning_rate": 4.543686278514053e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3607 }, { "completion_length": 480.75, "epoch": 1.0, "grad_norm": 0.0, "kl": 0.12279342859983444, "learning_rate": 4.543434094628237e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3608 }, { "completion_length": 498.75, "epoch": 1.0002771618625277, "grad_norm": 0.0, "kl": 0.1247466430068016, "learning_rate": 4.543181848078798e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3609 }, { "completion_length": 445.5, "epoch": 1.0005543237250554, "grad_norm": 0.45610311627388, "kl": 0.17643478512763977, "learning_rate": 4.542929538873472e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3610 }, { "completion_length": 485.25, "epoch": 1.000831485587583, "grad_norm": 0.0, "kl": 0.12305372208356857, "learning_rate": 4.542677167019995e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3611 }, { "completion_length": 824.0, "epoch": 1.001108647450111, "grad_norm": 0.3283900022506714, "kl": 0.09751886129379272, "learning_rate": 4.542424732526105e-06, "loss": 0.0, "reward": 3.5625, "reward_std": 2.5443973541259766, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5625, "step": 3612 }, { "completion_length": 486.75, "epoch": 1.0013858093126387, "grad_norm": 0.0, "kl": 0.12864919006824493, "learning_rate": 4.542172235399547e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3613 }, { "completion_length": 473.25, "epoch": 1.0016629711751663, "grad_norm": 0.42663705348968506, "kl": 2.5826966762542725, "learning_rate": 4.54191967564806e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3614 }, { "completion_length": 452.25, "epoch": 1.001940133037694, "grad_norm": 0.43858596682548523, "kl": 0.20847462117671967, "learning_rate": 4.541667053279392e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3615 }, { "completion_length": 464.75, "epoch": 1.0022172949002217, "grad_norm": 0.3937021493911743, "kl": 0.12274004518985748, "learning_rate": 4.541414368301288e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3616 }, { "completion_length": 500.25, "epoch": 1.0024944567627494, "grad_norm": 0.3567981719970703, "kl": 0.1326838880777359, "learning_rate": 4.541161620721497e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3617 }, { "completion_length": 443.5, "epoch": 1.002771618625277, "grad_norm": 0.0, "kl": 0.1529778689146042, "learning_rate": 4.540908810547771e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3618 }, { "completion_length": 517.0, "epoch": 1.0030487804878048, "grad_norm": 0.0, "kl": 0.11944802105426788, "learning_rate": 4.540655937787861e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3619 }, { "completion_length": 535.25, "epoch": 1.0033259423503327, "grad_norm": 0.3690844774246216, "kl": 0.10641815513372421, "learning_rate": 4.540403002449522e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3620 }, { "completion_length": 461.75, "epoch": 1.0036031042128604, "grad_norm": 0.35807955265045166, "kl": 0.1335369050502777, "learning_rate": 4.5401500045405126e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3621 }, { "completion_length": 487.5, "epoch": 1.003880266075388, "grad_norm": 0.0, "kl": 0.11916373670101166, "learning_rate": 4.539896944068588e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3622 }, { "completion_length": 490.5, "epoch": 1.0041574279379157, "grad_norm": 0.0, "kl": 0.13213790953159332, "learning_rate": 4.539643821041511e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3623 }, { "completion_length": 536.25, "epoch": 1.0044345898004434, "grad_norm": 0.368326336145401, "kl": 0.12869445979595184, "learning_rate": 4.539390635467041e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3624 }, { "completion_length": 489.25, "epoch": 1.004711751662971, "grad_norm": 0.3705807030200958, "kl": 0.11153829097747803, "learning_rate": 4.539137387352945e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3625 }, { "completion_length": 466.0, "epoch": 1.0049889135254988, "grad_norm": 0.0, "kl": 0.10364408791065216, "learning_rate": 4.538884076706987e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3626 }, { "completion_length": 485.25, "epoch": 1.0052660753880267, "grad_norm": 0.0, "kl": 0.17561222612857819, "learning_rate": 4.538630703536937e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3627 }, { "completion_length": 558.5, "epoch": 1.0055432372505544, "grad_norm": 0.0, "kl": 0.1131015494465828, "learning_rate": 4.538377267850564e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3628 }, { "completion_length": 471.0, "epoch": 1.005820399113082, "grad_norm": 0.42942318320274353, "kl": 0.12038718163967133, "learning_rate": 4.538123769655638e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3629 }, { "completion_length": 463.5, "epoch": 1.0060975609756098, "grad_norm": 0.37502941489219666, "kl": 0.13111189007759094, "learning_rate": 4.537870208959935e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3630 }, { "completion_length": 456.5, "epoch": 1.0063747228381374, "grad_norm": 0.39827919006347656, "kl": 0.1212942898273468, "learning_rate": 4.537616585771231e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3631 }, { "completion_length": 471.0, "epoch": 1.0066518847006651, "grad_norm": 0.3565334379673004, "kl": 0.12242584675550461, "learning_rate": 4.537362900097302e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3632 }, { "completion_length": 460.0, "epoch": 1.0069290465631928, "grad_norm": 0.48001915216445923, "kl": 0.10907856374979019, "learning_rate": 4.5371091519459274e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3633 }, { "completion_length": 509.5, "epoch": 1.0072062084257207, "grad_norm": 0.0, "kl": 0.10547909140586853, "learning_rate": 4.536855341324891e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3634 }, { "completion_length": 456.0, "epoch": 1.0074833702882484, "grad_norm": 0.0, "kl": 0.12333415448665619, "learning_rate": 4.536601468241973e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3635 }, { "completion_length": 448.0, "epoch": 1.007760532150776, "grad_norm": 0.40773141384124756, "kl": 0.12702125310897827, "learning_rate": 4.53634753270496e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3636 }, { "completion_length": 488.75, "epoch": 1.0080376940133038, "grad_norm": 0.0, "kl": 0.10928747802972794, "learning_rate": 4.536093534721639e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3637 }, { "completion_length": 473.25, "epoch": 1.0083148558758315, "grad_norm": 0.3597131371498108, "kl": 0.407013475894928, "learning_rate": 4.5358394742998e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3638 }, { "completion_length": 572.0, "epoch": 1.0085920177383592, "grad_norm": 0.0, "kl": 0.16845542192459106, "learning_rate": 4.5355853514472335e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3639 }, { "completion_length": 551.25, "epoch": 1.0088691796008868, "grad_norm": 0.3457397222518921, "kl": 0.1175050288438797, "learning_rate": 4.535331166171731e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3640 }, { "completion_length": 572.25, "epoch": 1.0091463414634145, "grad_norm": 0.3763805329799652, "kl": 0.12215932458639145, "learning_rate": 4.53507691848109e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3641 }, { "completion_length": 479.25, "epoch": 1.0094235033259424, "grad_norm": 0.0, "kl": 0.14606696367263794, "learning_rate": 4.534822608383104e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3642 }, { "completion_length": 465.75, "epoch": 1.0097006651884701, "grad_norm": 0.0, "kl": 0.11576678603887558, "learning_rate": 4.534568235885574e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3643 }, { "completion_length": 484.25, "epoch": 1.0099778270509978, "grad_norm": 0.0, "kl": 0.14064781367778778, "learning_rate": 4.5343138009963e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3644 }, { "completion_length": 485.75, "epoch": 1.0102549889135255, "grad_norm": 0.31235864758491516, "kl": 0.12477721273899078, "learning_rate": 4.534059303723083e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3645 }, { "completion_length": 454.5, "epoch": 1.0105321507760532, "grad_norm": 0.4290020763874054, "kl": 0.1116780936717987, "learning_rate": 4.533804744073729e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3646 }, { "completion_length": 545.0, "epoch": 1.0108093126385809, "grad_norm": 0.0, "kl": 0.11064677685499191, "learning_rate": 4.533550122056045e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3647 }, { "completion_length": 551.25, "epoch": 1.0110864745011086, "grad_norm": 0.32703304290771484, "kl": 0.10659793019294739, "learning_rate": 4.533295437677837e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3648 }, { "completion_length": 494.75, "epoch": 1.0113636363636365, "grad_norm": 0.3717033863067627, "kl": 0.138098806142807, "learning_rate": 4.533040690946917e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3649 }, { "completion_length": 510.5, "epoch": 1.0116407982261642, "grad_norm": 0.3643224537372589, "kl": 0.12770745158195496, "learning_rate": 4.532785881871095e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3650 }, { "completion_length": 532.25, "epoch": 1.0119179600886918, "grad_norm": 0.0, "kl": 0.0971364751458168, "learning_rate": 4.532531010458188e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3651 }, { "completion_length": 465.75, "epoch": 1.0121951219512195, "grad_norm": 0.4684562385082245, "kl": 0.12395889312028885, "learning_rate": 4.5322760767160095e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3652 }, { "completion_length": 459.5, "epoch": 1.0124722838137472, "grad_norm": 0.4076966643333435, "kl": 0.1117626428604126, "learning_rate": 4.532021080652377e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3653 }, { "completion_length": 495.5, "epoch": 1.012749445676275, "grad_norm": 0.35357311367988586, "kl": 0.11407369375228882, "learning_rate": 4.531766022275112e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3654 }, { "completion_length": 546.0, "epoch": 1.0130266075388026, "grad_norm": 0.3618727922439575, "kl": 0.13024453818798065, "learning_rate": 4.531510901592035e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3655 }, { "completion_length": 411.5, "epoch": 1.0133037694013303, "grad_norm": 0.0, "kl": 0.11808247119188309, "learning_rate": 4.531255718610968e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3656 }, { "completion_length": 481.75, "epoch": 1.0135809312638582, "grad_norm": 0.38515737652778625, "kl": 0.11440172046422958, "learning_rate": 4.53100047333974e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3657 }, { "completion_length": 552.25, "epoch": 1.0138580931263859, "grad_norm": 0.30173617601394653, "kl": 0.11559973657131195, "learning_rate": 4.530745165786174e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3658 }, { "completion_length": 427.75, "epoch": 1.0141352549889135, "grad_norm": 0.0, "kl": 0.11824274063110352, "learning_rate": 4.530489795958103e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3659 }, { "completion_length": 489.5, "epoch": 1.0144124168514412, "grad_norm": 0.0, "kl": 0.11567609757184982, "learning_rate": 4.530234363863356e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3660 }, { "completion_length": 439.75, "epoch": 1.014689578713969, "grad_norm": 0.3688659965991974, "kl": 0.19654683768749237, "learning_rate": 4.529978869509767e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3661 }, { "completion_length": 531.0, "epoch": 1.0149667405764966, "grad_norm": 0.0, "kl": 0.13877888023853302, "learning_rate": 4.529723312905171e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3662 }, { "completion_length": 473.0, "epoch": 1.0152439024390243, "grad_norm": 0.0, "kl": 0.12323112040758133, "learning_rate": 4.529467694057404e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3663 }, { "completion_length": 467.5, "epoch": 1.0155210643015522, "grad_norm": 0.4073721468448639, "kl": 0.1288500875234604, "learning_rate": 4.5292120129743044e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3664 }, { "completion_length": 536.25, "epoch": 1.0157982261640799, "grad_norm": 0.0, "kl": 0.11973073333501816, "learning_rate": 4.5289562696637145e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3665 }, { "completion_length": 524.75, "epoch": 1.0160753880266076, "grad_norm": 0.0, "kl": 0.13572661578655243, "learning_rate": 4.528700464133475e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3666 }, { "completion_length": 511.5, "epoch": 1.0163525498891353, "grad_norm": 0.0, "kl": 0.13311894237995148, "learning_rate": 4.528444596391433e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3667 }, { "completion_length": 529.25, "epoch": 1.016629711751663, "grad_norm": 0.0, "kl": 0.10346897691488266, "learning_rate": 4.528188666445432e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3668 }, { "completion_length": 499.0, "epoch": 1.0169068736141906, "grad_norm": 0.0, "kl": 0.13712149858474731, "learning_rate": 4.5279326743033225e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3669 }, { "completion_length": 433.0, "epoch": 1.0171840354767183, "grad_norm": 0.0, "kl": 0.10901611298322678, "learning_rate": 4.5276766199729536e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3670 }, { "completion_length": 522.25, "epoch": 1.0174611973392462, "grad_norm": 0.0, "kl": 0.11434989422559738, "learning_rate": 4.527420503462178e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3671 }, { "completion_length": 502.75, "epoch": 1.017738359201774, "grad_norm": 0.0, "kl": 0.14087854325771332, "learning_rate": 4.5271643247788496e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3672 }, { "completion_length": 507.25, "epoch": 1.0180155210643016, "grad_norm": 0.0, "kl": 0.11551059037446976, "learning_rate": 4.5269080839308236e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3673 }, { "completion_length": 494.0, "epoch": 1.0182926829268293, "grad_norm": 0.0, "kl": 0.11642567813396454, "learning_rate": 4.526651780925959e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3674 }, { "completion_length": 397.5, "epoch": 1.018569844789357, "grad_norm": 0.424242228269577, "kl": 0.12988291680812836, "learning_rate": 4.526395415772114e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3675 }, { "completion_length": 478.5, "epoch": 1.0188470066518847, "grad_norm": 0.392851859331131, "kl": 0.13601574301719666, "learning_rate": 4.526138988477152e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3676 }, { "completion_length": 515.75, "epoch": 1.0191241685144123, "grad_norm": 0.0, "kl": 0.12994587421417236, "learning_rate": 4.525882499048935e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3677 }, { "completion_length": 511.5, "epoch": 1.01940133037694, "grad_norm": 0.4391709268093109, "kl": 0.1120348870754242, "learning_rate": 4.52562594749533e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3678 }, { "completion_length": 537.0, "epoch": 1.019678492239468, "grad_norm": 0.3659379184246063, "kl": 2.0655152797698975, "learning_rate": 4.525369333824203e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3679 }, { "completion_length": 544.25, "epoch": 1.0199556541019956, "grad_norm": 0.42612990736961365, "kl": 0.12247325479984283, "learning_rate": 4.525112658043424e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3680 }, { "completion_length": 495.25, "epoch": 1.0202328159645233, "grad_norm": 0.0, "kl": 0.1351253241300583, "learning_rate": 4.524855920160864e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3681 }, { "completion_length": 634.75, "epoch": 1.020509977827051, "grad_norm": 0.2960275709629059, "kl": 0.10261962562799454, "learning_rate": 4.5245991201843966e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3682 }, { "completion_length": 483.25, "epoch": 1.0207871396895787, "grad_norm": 0.3611297607421875, "kl": 0.14741559326648712, "learning_rate": 4.5243422581218955e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3683 }, { "completion_length": 498.5, "epoch": 1.0210643015521064, "grad_norm": 0.4549257457256317, "kl": 0.11641241610050201, "learning_rate": 4.524085333981239e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3684 }, { "completion_length": 531.25, "epoch": 1.021341463414634, "grad_norm": 0.0, "kl": 0.15812067687511444, "learning_rate": 4.5238283477703055e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3685 }, { "completion_length": 467.0, "epoch": 1.021618625277162, "grad_norm": 0.39528709650039673, "kl": 0.15178370475769043, "learning_rate": 4.523571299496975e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3686 }, { "completion_length": 542.75, "epoch": 1.0218957871396896, "grad_norm": 0.0, "kl": 0.11440707743167877, "learning_rate": 4.5233141891691305e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3687 }, { "completion_length": 560.75, "epoch": 1.0221729490022173, "grad_norm": 0.3239001929759979, "kl": 0.10939743369817734, "learning_rate": 4.5230570167946565e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3688 }, { "completion_length": 485.75, "epoch": 1.022450110864745, "grad_norm": 0.4017375707626343, "kl": 0.11457718163728714, "learning_rate": 4.5227997823814405e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3689 }, { "completion_length": 471.25, "epoch": 1.0227272727272727, "grad_norm": 0.3848221004009247, "kl": 0.1135508269071579, "learning_rate": 4.522542485937369e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3690 }, { "completion_length": 547.75, "epoch": 1.0230044345898004, "grad_norm": 0.29137545824050903, "kl": 0.1242850124835968, "learning_rate": 4.522285127470333e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3691 }, { "completion_length": 480.0, "epoch": 1.023281596452328, "grad_norm": 0.3838277757167816, "kl": 0.08540359139442444, "learning_rate": 4.5220277069882244e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3692 }, { "completion_length": 466.75, "epoch": 1.0235587583148558, "grad_norm": 0.0, "kl": 0.10863189399242401, "learning_rate": 4.521770224498938e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3693 }, { "completion_length": 462.25, "epoch": 1.0238359201773837, "grad_norm": 0.356948584318161, "kl": 0.11808702349662781, "learning_rate": 4.5215126800103695e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3694 }, { "completion_length": 451.5, "epoch": 1.0241130820399114, "grad_norm": 0.0, "kl": 0.12319056689739227, "learning_rate": 4.5212550735304165e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3695 }, { "completion_length": 464.5, "epoch": 1.024390243902439, "grad_norm": 0.0, "kl": 0.11892674118280411, "learning_rate": 4.520997405066977e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3696 }, { "completion_length": 477.5, "epoch": 1.0246674057649667, "grad_norm": 0.0, "kl": 0.13914312422275543, "learning_rate": 4.520739674627955e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3697 }, { "completion_length": 489.75, "epoch": 1.0249445676274944, "grad_norm": 0.0, "kl": 0.24868176877498627, "learning_rate": 4.520481882221254e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3698 }, { "completion_length": 392.0, "epoch": 1.025221729490022, "grad_norm": 0.0, "kl": 0.13902899622917175, "learning_rate": 4.520224027854779e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3699 }, { "completion_length": 522.25, "epoch": 1.0254988913525498, "grad_norm": 0.0, "kl": 0.1073882207274437, "learning_rate": 4.519966111536436e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3700 }, { "completion_length": 478.75, "epoch": 1.0257760532150777, "grad_norm": 0.0, "kl": 0.14293330907821655, "learning_rate": 4.5197081332741345e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3701 }, { "completion_length": 519.25, "epoch": 1.0260532150776054, "grad_norm": 0.30991122126579285, "kl": 0.11731518805027008, "learning_rate": 4.519450093075787e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3702 }, { "completion_length": 547.25, "epoch": 1.026330376940133, "grad_norm": 0.0, "kl": 0.12538689374923706, "learning_rate": 4.519191990949307e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3703 }, { "completion_length": 546.75, "epoch": 1.0266075388026608, "grad_norm": 0.3516770601272583, "kl": 0.11225517839193344, "learning_rate": 4.518933826902607e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3704 }, { "completion_length": 500.25, "epoch": 1.0268847006651884, "grad_norm": 0.3601730167865753, "kl": 0.126077339053154, "learning_rate": 4.518675600943605e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3705 }, { "completion_length": 552.0, "epoch": 1.0271618625277161, "grad_norm": 0.0, "kl": 0.13380573689937592, "learning_rate": 4.51841731308022e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3706 }, { "completion_length": 494.5, "epoch": 1.0274390243902438, "grad_norm": 0.32629677653312683, "kl": 0.12081395089626312, "learning_rate": 4.518158963320373e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3707 }, { "completion_length": 492.0, "epoch": 1.0277161862527717, "grad_norm": 0.0, "kl": 0.1259414702653885, "learning_rate": 4.5179005516719855e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3708 }, { "completion_length": 447.25, "epoch": 1.0279933481152994, "grad_norm": 0.0, "kl": 0.4704459011554718, "learning_rate": 4.517642078142983e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3709 }, { "completion_length": 489.0, "epoch": 1.028270509977827, "grad_norm": 0.40076062083244324, "kl": 0.10765092074871063, "learning_rate": 4.51738354274129e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3710 }, { "completion_length": 571.25, "epoch": 1.0285476718403548, "grad_norm": 0.3270873725414276, "kl": 0.10758443176746368, "learning_rate": 4.517124945474836e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3711 }, { "completion_length": 579.25, "epoch": 1.0288248337028825, "grad_norm": 0.0, "kl": 0.1265604943037033, "learning_rate": 4.5168662863515515e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3712 }, { "completion_length": 521.5, "epoch": 1.0291019955654102, "grad_norm": 0.35773563385009766, "kl": 0.35312777757644653, "learning_rate": 4.516607565379368e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3713 }, { "completion_length": 458.5, "epoch": 1.0293791574279378, "grad_norm": 0.4239896535873413, "kl": 0.1343494951725006, "learning_rate": 4.516348782566219e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3714 }, { "completion_length": 505.0, "epoch": 1.0296563192904655, "grad_norm": 0.4090094566345215, "kl": 0.10050231218338013, "learning_rate": 4.5160899379200405e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3715 }, { "completion_length": 473.75, "epoch": 1.0299334811529934, "grad_norm": 0.3707987666130066, "kl": 0.1227286159992218, "learning_rate": 4.51583103144877e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3716 }, { "completion_length": 561.75, "epoch": 1.0302106430155211, "grad_norm": 0.3379131853580475, "kl": 0.0991741195321083, "learning_rate": 4.515572063160349e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3717 }, { "completion_length": 478.0, "epoch": 1.0304878048780488, "grad_norm": 0.0, "kl": 0.130818709731102, "learning_rate": 4.515313033062716e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3718 }, { "completion_length": 537.0, "epoch": 1.0307649667405765, "grad_norm": 0.31072497367858887, "kl": 0.1100325882434845, "learning_rate": 4.515053941163817e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3719 }, { "completion_length": 524.0, "epoch": 1.0310421286031042, "grad_norm": 0.38538581132888794, "kl": 0.1988338828086853, "learning_rate": 4.514794787471595e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3720 }, { "completion_length": 527.0, "epoch": 1.0313192904656319, "grad_norm": 0.0, "kl": 0.12074831873178482, "learning_rate": 4.514535571993998e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3721 }, { "completion_length": 448.0, "epoch": 1.0315964523281596, "grad_norm": 0.0, "kl": 0.1146247535943985, "learning_rate": 4.514276294738976e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3722 }, { "completion_length": 547.25, "epoch": 1.0318736141906875, "grad_norm": 0.0, "kl": 0.12029880285263062, "learning_rate": 4.514016955714478e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3723 }, { "completion_length": 477.75, "epoch": 1.0321507760532151, "grad_norm": 0.0, "kl": 0.1528470814228058, "learning_rate": 4.51375755492846e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3724 }, { "completion_length": 554.25, "epoch": 1.0324279379157428, "grad_norm": 0.0, "kl": 0.1829422265291214, "learning_rate": 4.513498092388874e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3725 }, { "completion_length": 524.75, "epoch": 1.0327050997782705, "grad_norm": 0.0, "kl": 0.1321963369846344, "learning_rate": 4.513238568103676e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3726 }, { "completion_length": 503.25, "epoch": 1.0329822616407982, "grad_norm": 0.0, "kl": 0.11378663778305054, "learning_rate": 4.512978982080828e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3727 }, { "completion_length": 548.25, "epoch": 1.033259423503326, "grad_norm": 0.31249338388442993, "kl": 0.3436334431171417, "learning_rate": 4.512719334328287e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3728 }, { "completion_length": 564.75, "epoch": 1.0335365853658536, "grad_norm": 0.0, "kl": 0.14506368339061737, "learning_rate": 4.512459624854017e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3729 }, { "completion_length": 530.5, "epoch": 1.0338137472283813, "grad_norm": 0.0, "kl": 0.12554733455181122, "learning_rate": 4.512199853665983e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3730 }, { "completion_length": 528.75, "epoch": 1.0340909090909092, "grad_norm": 0.38053566217422485, "kl": 0.11767446249723434, "learning_rate": 4.51194002077215e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3731 }, { "completion_length": 528.25, "epoch": 1.0343680709534369, "grad_norm": 0.33536526560783386, "kl": 0.08742257207632065, "learning_rate": 4.5116801261804846e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3732 }, { "completion_length": 496.25, "epoch": 1.0346452328159645, "grad_norm": 0.35459640622138977, "kl": 0.12970061600208282, "learning_rate": 4.511420169898959e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3733 }, { "completion_length": 528.0, "epoch": 1.0349223946784922, "grad_norm": 0.0, "kl": 0.11278904229402542, "learning_rate": 4.511160151935544e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3734 }, { "completion_length": 700.5, "epoch": 1.03519955654102, "grad_norm": 0.24598067998886108, "kl": 0.0863664373755455, "learning_rate": 4.510900072298213e-06, "loss": -0.0, "reward": 2.625, "reward_std": 1.75, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.625, "step": 3735 }, { "completion_length": 494.75, "epoch": 1.0354767184035476, "grad_norm": 0.3321184813976288, "kl": 0.11240523308515549, "learning_rate": 4.510639930994942e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3736 }, { "completion_length": 515.25, "epoch": 1.0357538802660753, "grad_norm": 0.0, "kl": 0.13079966604709625, "learning_rate": 4.5103797280337096e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3737 }, { "completion_length": 455.0, "epoch": 1.0360310421286032, "grad_norm": 0.0, "kl": 0.13625049591064453, "learning_rate": 4.510119463422493e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3738 }, { "completion_length": 543.0, "epoch": 1.0363082039911309, "grad_norm": 0.3153694272041321, "kl": 0.0992172360420227, "learning_rate": 4.509859137169274e-06, "loss": -0.0, "reward": 2.09375, "reward_std": 2.7336158752441406, "rewards/confident_score_func": 0.25, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.59375, "step": 3739 }, { "completion_length": 434.75, "epoch": 1.0365853658536586, "grad_norm": 0.0, "kl": 0.11487830430269241, "learning_rate": 4.509598749282036e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3740 }, { "completion_length": 501.5, "epoch": 1.0368625277161863, "grad_norm": 0.31715404987335205, "kl": 0.10292330384254456, "learning_rate": 4.509338299768765e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3741 }, { "completion_length": 543.75, "epoch": 1.037139689578714, "grad_norm": 0.0, "kl": 0.11675607413053513, "learning_rate": 4.509077788637446e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3742 }, { "completion_length": 551.5, "epoch": 1.0374168514412416, "grad_norm": 0.0, "kl": 0.1070961058139801, "learning_rate": 4.508817215896069e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3743 }, { "completion_length": 495.0, "epoch": 1.0376940133037693, "grad_norm": 0.0, "kl": 0.12815158069133759, "learning_rate": 4.508556581552624e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3744 }, { "completion_length": 489.25, "epoch": 1.0379711751662972, "grad_norm": 0.4242716133594513, "kl": 0.1321251392364502, "learning_rate": 4.508295885615105e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3745 }, { "completion_length": 514.5, "epoch": 1.038248337028825, "grad_norm": 0.3353440761566162, "kl": 0.10157080739736557, "learning_rate": 4.508035128091506e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3746 }, { "completion_length": 496.5, "epoch": 1.0385254988913526, "grad_norm": 0.38317859172821045, "kl": 0.13085229694843292, "learning_rate": 4.507774308989822e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3747 }, { "completion_length": 567.0, "epoch": 1.0388026607538803, "grad_norm": 0.0, "kl": 0.09766516834497452, "learning_rate": 4.507513428318052e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3748 }, { "completion_length": 486.0, "epoch": 1.039079822616408, "grad_norm": 0.3670582175254822, "kl": 0.10556977987289429, "learning_rate": 4.507252486084196e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3749 }, { "completion_length": 588.25, "epoch": 1.0393569844789357, "grad_norm": 0.28256097435951233, "kl": 0.126109316945076, "learning_rate": 4.506991482296256e-06, "loss": -0.0, "reward": 2.71875, "reward_std": 2.0216922760009766, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.71875, "step": 3750 }, { "completion_length": 461.25, "epoch": 1.0396341463414633, "grad_norm": 0.0, "kl": 0.12683269381523132, "learning_rate": 4.506730416962237e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3751 }, { "completion_length": 447.75, "epoch": 1.039911308203991, "grad_norm": 0.0, "kl": 0.1255490779876709, "learning_rate": 4.506469290090144e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3752 }, { "completion_length": 618.0, "epoch": 1.040188470066519, "grad_norm": 0.0, "kl": 0.09724785387516022, "learning_rate": 4.506208101687984e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3753 }, { "completion_length": 557.75, "epoch": 1.0404656319290466, "grad_norm": 0.0, "kl": 0.10918258875608444, "learning_rate": 4.505946851763766e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3754 }, { "completion_length": 855.75, "epoch": 1.0407427937915743, "grad_norm": 0.0, "kl": 0.09681164473295212, "learning_rate": 4.505685540325504e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3755 }, { "completion_length": 540.5, "epoch": 1.041019955654102, "grad_norm": 0.38364556431770325, "kl": 0.2645179331302643, "learning_rate": 4.505424167381211e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3756 }, { "completion_length": 509.75, "epoch": 1.0412971175166297, "grad_norm": 0.35553765296936035, "kl": 0.15446460247039795, "learning_rate": 4.505162732938899e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3757 }, { "completion_length": 619.0, "epoch": 1.0415742793791574, "grad_norm": 0.0, "kl": 0.25527235865592957, "learning_rate": 4.504901237006588e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3758 }, { "completion_length": 509.75, "epoch": 1.041851441241685, "grad_norm": 0.3524342179298401, "kl": 0.1286095380783081, "learning_rate": 4.504639679592297e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3759 }, { "completion_length": 568.75, "epoch": 1.042128603104213, "grad_norm": 0.3147577941417694, "kl": 0.1081058457493782, "learning_rate": 4.504378060704045e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3760 }, { "completion_length": 496.0, "epoch": 1.0424057649667406, "grad_norm": 0.393310546875, "kl": 0.11096800863742828, "learning_rate": 4.504116380349856e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3761 }, { "completion_length": 509.5, "epoch": 1.0426829268292683, "grad_norm": 0.0, "kl": 0.09825430810451508, "learning_rate": 4.503854638537756e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3762 }, { "completion_length": 524.25, "epoch": 1.042960088691796, "grad_norm": 0.0, "kl": 0.12342607975006104, "learning_rate": 4.5035928352757684e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3763 }, { "completion_length": 512.25, "epoch": 1.0432372505543237, "grad_norm": 0.0, "kl": 0.11494326591491699, "learning_rate": 4.503330970571924e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3764 }, { "completion_length": 498.5, "epoch": 1.0435144124168514, "grad_norm": 0.39192894101142883, "kl": 0.11999844759702682, "learning_rate": 4.503069044434252e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3765 }, { "completion_length": 553.5, "epoch": 1.043791574279379, "grad_norm": 0.3709873557090759, "kl": 0.11270590126514435, "learning_rate": 4.502807056870785e-06, "loss": -0.0, "reward": 5.5, "reward_std": 0.5, "rewards/confident_score_func": 1.75, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3766 }, { "completion_length": 480.0, "epoch": 1.0440687361419068, "grad_norm": 0.3704870939254761, "kl": 0.12986427545547485, "learning_rate": 4.502545007889557e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3767 }, { "completion_length": 519.75, "epoch": 1.0443458980044347, "grad_norm": 0.0, "kl": 0.13369397819042206, "learning_rate": 4.5022828974986044e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3768 }, { "completion_length": 497.5, "epoch": 1.0446230598669624, "grad_norm": 0.0, "kl": 0.11410262435674667, "learning_rate": 4.502020725705965e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3769 }, { "completion_length": 494.75, "epoch": 1.04490022172949, "grad_norm": 0.38752487301826477, "kl": 0.17524392902851105, "learning_rate": 4.501758492519678e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3770 }, { "completion_length": 469.25, "epoch": 1.0451773835920177, "grad_norm": 0.0, "kl": 0.1775536686182022, "learning_rate": 4.501496197947785e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3771 }, { "completion_length": 451.5, "epoch": 1.0454545454545454, "grad_norm": 0.0, "kl": 0.09704200178384781, "learning_rate": 4.501233841998328e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3772 }, { "completion_length": 561.75, "epoch": 1.045731707317073, "grad_norm": 0.3308154344558716, "kl": 0.10936566442251205, "learning_rate": 4.5009714246793554e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3773 }, { "completion_length": 483.25, "epoch": 1.0460088691796008, "grad_norm": 0.0, "kl": 0.1171731948852539, "learning_rate": 4.500708945998913e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3774 }, { "completion_length": 566.75, "epoch": 1.0462860310421287, "grad_norm": 0.3159599006175995, "kl": 0.13264518976211548, "learning_rate": 4.500446405965051e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3775 }, { "completion_length": 536.25, "epoch": 1.0465631929046564, "grad_norm": 0.0, "kl": 0.11816377192735672, "learning_rate": 4.500183804585818e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3776 }, { "completion_length": 528.0, "epoch": 1.046840354767184, "grad_norm": 0.33862918615341187, "kl": 0.11441435664892197, "learning_rate": 4.499921141869268e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3777 }, { "completion_length": 510.5, "epoch": 1.0471175166297118, "grad_norm": 0.0, "kl": 0.10828141868114471, "learning_rate": 4.499658417823457e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3778 }, { "completion_length": 502.75, "epoch": 1.0473946784922394, "grad_norm": 0.0, "kl": 0.15477387607097626, "learning_rate": 4.4993956324564405e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3779 }, { "completion_length": 457.25, "epoch": 1.0476718403547671, "grad_norm": 0.3840961158275604, "kl": 0.16239981353282928, "learning_rate": 4.4991327857762765e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3780 }, { "completion_length": 516.5, "epoch": 1.0479490022172948, "grad_norm": 0.0, "kl": 0.12917570769786835, "learning_rate": 4.498869877791026e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3781 }, { "completion_length": 528.75, "epoch": 1.0482261640798227, "grad_norm": 0.0, "kl": 0.10666517168283463, "learning_rate": 4.498606908508754e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3782 }, { "completion_length": 539.5, "epoch": 1.0485033259423504, "grad_norm": 0.0, "kl": 0.1862720102071762, "learning_rate": 4.498343877937519e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3783 }, { "completion_length": 495.75, "epoch": 1.048780487804878, "grad_norm": 0.35000672936439514, "kl": 0.12264171987771988, "learning_rate": 4.498080786085392e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3784 }, { "completion_length": 459.75, "epoch": 1.0490576496674058, "grad_norm": 0.43901070952415466, "kl": 0.12254726886749268, "learning_rate": 4.497817632960439e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3785 }, { "completion_length": 520.5, "epoch": 1.0493348115299335, "grad_norm": 0.0, "kl": 0.1123451441526413, "learning_rate": 4.4975544185707295e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3786 }, { "completion_length": 456.0, "epoch": 1.0496119733924612, "grad_norm": 0.37231096625328064, "kl": 0.10779762268066406, "learning_rate": 4.497291142924335e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3787 }, { "completion_length": 431.75, "epoch": 1.0498891352549888, "grad_norm": 0.0, "kl": 0.12514646351337433, "learning_rate": 4.497027806029331e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3788 }, { "completion_length": 429.75, "epoch": 1.0501662971175165, "grad_norm": 0.3932294547557831, "kl": 0.1638864278793335, "learning_rate": 4.49676440789379e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3789 }, { "completion_length": 509.5, "epoch": 1.0504434589800444, "grad_norm": 0.4088635742664337, "kl": 0.12782172858715057, "learning_rate": 4.496500948525792e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3790 }, { "completion_length": 476.5, "epoch": 1.0507206208425721, "grad_norm": 0.0, "kl": 0.13675987720489502, "learning_rate": 4.496237427933416e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3791 }, { "completion_length": 520.5, "epoch": 1.0509977827050998, "grad_norm": 0.3669690787792206, "kl": 0.17647969722747803, "learning_rate": 4.49597384612474e-06, "loss": 0.0, "reward": 4.59375, "reward_std": 1.9185905456542969, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 3792 }, { "completion_length": 516.25, "epoch": 1.0512749445676275, "grad_norm": 0.0, "kl": 0.12790320813655853, "learning_rate": 4.49571020310785e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3793 }, { "completion_length": 441.5, "epoch": 1.0515521064301552, "grad_norm": 0.0, "kl": 0.1465827077627182, "learning_rate": 4.4954464988908306e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3794 }, { "completion_length": 474.0, "epoch": 1.0518292682926829, "grad_norm": 0.0, "kl": 0.19067399203777313, "learning_rate": 4.4951827334817675e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3795 }, { "completion_length": 520.5, "epoch": 1.0521064301552105, "grad_norm": 0.4357389807701111, "kl": 0.13042202591896057, "learning_rate": 4.49491890688875e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3796 }, { "completion_length": 908.0, "epoch": 1.0523835920177385, "grad_norm": 0.22138965129852295, "kl": 0.08877094835042953, "learning_rate": 4.494655019119869e-06, "loss": 0.0, "reward": 5.71875, "reward_std": 0.0625, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.71875, "step": 3797 }, { "completion_length": 479.75, "epoch": 1.0526607538802661, "grad_norm": 0.0, "kl": 0.096966952085495, "learning_rate": 4.494391070183215e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3798 }, { "completion_length": 442.25, "epoch": 1.0529379157427938, "grad_norm": 0.0, "kl": 0.13913576304912567, "learning_rate": 4.494127060086884e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3799 }, { "completion_length": 499.5, "epoch": 1.0532150776053215, "grad_norm": 0.3312676250934601, "kl": 0.12551617622375488, "learning_rate": 4.49386298883897e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3800 }, { "completion_length": 481.25, "epoch": 1.0534922394678492, "grad_norm": 0.35174062848091125, "kl": 0.16238342225551605, "learning_rate": 4.4935988564475745e-06, "loss": 0.0, "reward": 1.625, "reward_std": 0.25, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3801 }, { "completion_length": 505.5, "epoch": 1.0537694013303769, "grad_norm": 0.0, "kl": 0.09795106202363968, "learning_rate": 4.493334662920794e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3802 }, { "completion_length": 482.25, "epoch": 1.0540465631929046, "grad_norm": 0.3652992248535156, "kl": 0.13305577635765076, "learning_rate": 4.493070408266732e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3803 }, { "completion_length": 464.25, "epoch": 1.0543237250554323, "grad_norm": 0.0, "kl": 0.12851133942604065, "learning_rate": 4.492806092493492e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3804 }, { "completion_length": 465.0, "epoch": 1.0546008869179602, "grad_norm": 0.4013800621032715, "kl": 0.1530570685863495, "learning_rate": 4.492541715609177e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3805 }, { "completion_length": 449.5, "epoch": 1.0548780487804879, "grad_norm": 0.0, "kl": 0.15580664575099945, "learning_rate": 4.492277277621898e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3806 }, { "completion_length": 492.25, "epoch": 1.0551552106430155, "grad_norm": 0.0, "kl": 0.13134711980819702, "learning_rate": 4.4920127785397615e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3807 }, { "completion_length": 452.25, "epoch": 1.0554323725055432, "grad_norm": 0.3922896981239319, "kl": 0.1622426062822342, "learning_rate": 4.49174821837088e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3808 }, { "completion_length": 453.5, "epoch": 1.055709534368071, "grad_norm": 0.0, "kl": 0.13911853730678558, "learning_rate": 4.491483597123368e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3809 }, { "completion_length": 627.5, "epoch": 1.0559866962305986, "grad_norm": 0.0, "kl": 0.13901561498641968, "learning_rate": 4.491218914805336e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3810 }, { "completion_length": 512.75, "epoch": 1.0562638580931263, "grad_norm": 0.0, "kl": 0.14835338294506073, "learning_rate": 4.490954171424904e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3811 }, { "completion_length": 526.75, "epoch": 1.0565410199556542, "grad_norm": 0.0, "kl": 0.11037316173315048, "learning_rate": 4.49068936699019e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3812 }, { "completion_length": 557.25, "epoch": 1.0568181818181819, "grad_norm": 0.0, "kl": 0.13776494562625885, "learning_rate": 4.490424501509313e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3813 }, { "completion_length": 499.0, "epoch": 1.0570953436807096, "grad_norm": 0.3488932251930237, "kl": 0.15890620648860931, "learning_rate": 4.490159574990398e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3814 }, { "completion_length": 690.25, "epoch": 1.0573725055432373, "grad_norm": 0.3112681210041046, "kl": 0.0957956463098526, "learning_rate": 4.4898945874415675e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3815 }, { "completion_length": 522.0, "epoch": 1.057649667405765, "grad_norm": 0.0, "kl": 0.10182648152112961, "learning_rate": 4.489629538870947e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3816 }, { "completion_length": 551.25, "epoch": 1.0579268292682926, "grad_norm": 0.0, "kl": 0.13071231544017792, "learning_rate": 4.489364429286666e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3817 }, { "completion_length": 532.75, "epoch": 1.0582039911308203, "grad_norm": 0.38485899567604065, "kl": 0.10485420376062393, "learning_rate": 4.489099258696853e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3818 }, { "completion_length": 560.75, "epoch": 1.0584811529933482, "grad_norm": 0.0, "kl": 0.1451009213924408, "learning_rate": 4.48883402710964e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3819 }, { "completion_length": 508.25, "epoch": 1.058758314855876, "grad_norm": 0.0, "kl": 0.11869219690561295, "learning_rate": 4.488568734533161e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3820 }, { "completion_length": 518.0, "epoch": 1.0590354767184036, "grad_norm": 0.3597796559333801, "kl": 0.09058818966150284, "learning_rate": 4.488303380975551e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3821 }, { "completion_length": 450.5, "epoch": 1.0593126385809313, "grad_norm": 0.39978480339050293, "kl": 0.1504981815814972, "learning_rate": 4.488037966444948e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3822 }, { "completion_length": 588.25, "epoch": 1.059589800443459, "grad_norm": 0.3405757546424866, "kl": 0.12418486922979355, "learning_rate": 4.48777249094949e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3823 }, { "completion_length": 584.25, "epoch": 1.0598669623059866, "grad_norm": 0.3201026916503906, "kl": 0.08473213016986847, "learning_rate": 4.487506954497318e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3824 }, { "completion_length": 616.5, "epoch": 1.0601441241685143, "grad_norm": 0.0, "kl": 0.09261351823806763, "learning_rate": 4.487241357096577e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3825 }, { "completion_length": 551.0, "epoch": 1.0604212860310422, "grad_norm": 0.0, "kl": 0.105493925511837, "learning_rate": 4.4869756987554095e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3826 }, { "completion_length": 493.0, "epoch": 1.06069844789357, "grad_norm": 0.0, "kl": 0.13170751929283142, "learning_rate": 4.486709979481963e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3827 }, { "completion_length": 467.25, "epoch": 1.0609756097560976, "grad_norm": 0.0, "kl": 0.13150367140769958, "learning_rate": 4.486444199284386e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3828 }, { "completion_length": 515.75, "epoch": 1.0612527716186253, "grad_norm": 0.3607332110404968, "kl": 0.09946133196353912, "learning_rate": 4.486178358170829e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3829 }, { "completion_length": 510.5, "epoch": 1.061529933481153, "grad_norm": 0.0, "kl": 0.18065480887889862, "learning_rate": 4.485912456149444e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3830 }, { "completion_length": 548.75, "epoch": 1.0618070953436807, "grad_norm": 0.386890172958374, "kl": 0.11143502593040466, "learning_rate": 4.485646493228385e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3831 }, { "completion_length": 535.0, "epoch": 1.0620842572062084, "grad_norm": 0.34687143564224243, "kl": 0.12002518028020859, "learning_rate": 4.485380469415807e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3832 }, { "completion_length": 560.5, "epoch": 1.062361419068736, "grad_norm": 0.0, "kl": 0.09920576959848404, "learning_rate": 4.4851143847198706e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3833 }, { "completion_length": 542.0, "epoch": 1.062638580931264, "grad_norm": 0.0, "kl": 0.10950595885515213, "learning_rate": 4.484848239148734e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3834 }, { "completion_length": 846.5, "epoch": 1.0629157427937916, "grad_norm": 0.38870084285736084, "kl": 0.0903952494263649, "learning_rate": 4.484582032710558e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3835 }, { "completion_length": 549.0, "epoch": 1.0631929046563193, "grad_norm": 0.0, "kl": 0.10502979904413223, "learning_rate": 4.484315765413507e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3836 }, { "completion_length": 897.25, "epoch": 1.063470066518847, "grad_norm": 0.3170143961906433, "kl": 0.10001754015684128, "learning_rate": 4.484049437265746e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3837 }, { "completion_length": 517.75, "epoch": 1.0637472283813747, "grad_norm": 0.31858697533607483, "kl": 0.11122477799654007, "learning_rate": 4.483783048275442e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3838 }, { "completion_length": 551.5, "epoch": 1.0640243902439024, "grad_norm": 0.3331061601638794, "kl": 0.09513215720653534, "learning_rate": 4.483516598450764e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3839 }, { "completion_length": 477.0, "epoch": 1.06430155210643, "grad_norm": 0.32502543926239014, "kl": 0.11830676347017288, "learning_rate": 4.4832500877998835e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3840 }, { "completion_length": 499.75, "epoch": 1.0645787139689578, "grad_norm": 0.0, "kl": 0.1226162537932396, "learning_rate": 4.482983516330972e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3841 }, { "completion_length": 604.0, "epoch": 1.0648558758314857, "grad_norm": 0.3288871943950653, "kl": 0.09814392030239105, "learning_rate": 4.482716884052207e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3842 }, { "completion_length": 692.5, "epoch": 1.0651330376940134, "grad_norm": 0.0, "kl": 0.08252900838851929, "learning_rate": 4.482450190971762e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3843 }, { "completion_length": 513.0, "epoch": 1.065410199556541, "grad_norm": 0.0, "kl": 0.1542436182498932, "learning_rate": 4.482183437097815e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3844 }, { "completion_length": 501.75, "epoch": 1.0656873614190687, "grad_norm": 0.0, "kl": 0.08536940068006516, "learning_rate": 4.48191662243855e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3845 }, { "completion_length": 548.75, "epoch": 1.0659645232815964, "grad_norm": 0.48176926374435425, "kl": 0.13716192543506622, "learning_rate": 4.481649747002146e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3846 }, { "completion_length": 482.75, "epoch": 1.066241685144124, "grad_norm": 0.0, "kl": 0.13961496949195862, "learning_rate": 4.481382810796787e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3847 }, { "completion_length": 610.0, "epoch": 1.0665188470066518, "grad_norm": 0.0, "kl": 0.10693168640136719, "learning_rate": 4.481115813830661e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3848 }, { "completion_length": 562.5, "epoch": 1.0667960088691797, "grad_norm": 0.3418329060077667, "kl": 0.11279325187206268, "learning_rate": 4.480848756111953e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3849 }, { "completion_length": 556.0, "epoch": 1.0670731707317074, "grad_norm": 0.32641085982322693, "kl": 0.09292538464069366, "learning_rate": 4.4805816376488545e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3850 }, { "completion_length": 890.25, "epoch": 1.067350332594235, "grad_norm": 0.19590233266353607, "kl": 0.08038940280675888, "learning_rate": 4.480314458449555e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3851 }, { "completion_length": 537.25, "epoch": 1.0676274944567627, "grad_norm": 0.3504473567008972, "kl": 0.10288084298372269, "learning_rate": 4.48004721852225e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3852 }, { "completion_length": 479.75, "epoch": 1.0679046563192904, "grad_norm": 0.33429479598999023, "kl": 0.10634295642375946, "learning_rate": 4.479779917875133e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3853 }, { "completion_length": 543.25, "epoch": 1.0681818181818181, "grad_norm": 0.0, "kl": 0.11837620288133621, "learning_rate": 4.479512556516402e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3854 }, { "completion_length": 506.5, "epoch": 1.0684589800443458, "grad_norm": 0.36674872040748596, "kl": 0.10699735581874847, "learning_rate": 4.479245134454256e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3855 }, { "completion_length": 506.25, "epoch": 1.0687361419068737, "grad_norm": 0.35898810625076294, "kl": 0.11017053574323654, "learning_rate": 4.478977651696894e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3856 }, { "completion_length": 568.75, "epoch": 1.0690133037694014, "grad_norm": 0.0, "kl": 0.12604038417339325, "learning_rate": 4.478710108252521e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3857 }, { "completion_length": 553.25, "epoch": 1.069290465631929, "grad_norm": 0.0, "kl": 2.661600351333618, "learning_rate": 4.478442504129339e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3858 }, { "completion_length": 577.5, "epoch": 1.0695676274944568, "grad_norm": 0.3491274416446686, "kl": 0.10163949429988861, "learning_rate": 4.478174839335556e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3859 }, { "completion_length": 551.5, "epoch": 1.0698447893569845, "grad_norm": 0.0, "kl": 0.09645967185497284, "learning_rate": 4.47790711387938e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3860 }, { "completion_length": 536.5, "epoch": 1.0701219512195121, "grad_norm": 0.0, "kl": 0.1321275681257248, "learning_rate": 4.4776393277690205e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3861 }, { "completion_length": 606.0, "epoch": 1.0703991130820398, "grad_norm": 0.3157045543193817, "kl": 0.08384755998849869, "learning_rate": 4.477371481012689e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3862 }, { "completion_length": 651.0, "epoch": 1.0706762749445677, "grad_norm": 0.2557811141014099, "kl": 0.10562216490507126, "learning_rate": 4.477103573618601e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3863 }, { "completion_length": 492.75, "epoch": 1.0709534368070954, "grad_norm": 0.0, "kl": 0.11875399947166443, "learning_rate": 4.47683560559497e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3864 }, { "completion_length": 592.0, "epoch": 1.0712305986696231, "grad_norm": 0.3578028380870819, "kl": 0.16489197313785553, "learning_rate": 4.476567576950015e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3865 }, { "completion_length": 524.0, "epoch": 1.0715077605321508, "grad_norm": 0.0, "kl": 0.10790915787220001, "learning_rate": 4.476299487691954e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3866 }, { "completion_length": 584.5, "epoch": 1.0717849223946785, "grad_norm": 0.312832236289978, "kl": 0.106150783598423, "learning_rate": 4.47603133782901e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3867 }, { "completion_length": 503.25, "epoch": 1.0720620842572062, "grad_norm": 0.0, "kl": 0.11664025485515594, "learning_rate": 4.475763127369405e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3868 }, { "completion_length": 653.5, "epoch": 1.0723392461197339, "grad_norm": 0.0, "kl": 0.11528405547142029, "learning_rate": 4.475494856321362e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3869 }, { "completion_length": 575.5, "epoch": 1.0726164079822615, "grad_norm": 0.0, "kl": 0.1219368577003479, "learning_rate": 4.475226524693112e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3870 }, { "completion_length": 538.25, "epoch": 1.0728935698447895, "grad_norm": 0.38860660791397095, "kl": 0.27357032895088196, "learning_rate": 4.474958132492879e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3871 }, { "completion_length": 535.75, "epoch": 1.0731707317073171, "grad_norm": 0.31110629439353943, "kl": 0.11226949840784073, "learning_rate": 4.474689679728897e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3872 }, { "completion_length": 507.75, "epoch": 1.0734478935698448, "grad_norm": 0.0, "kl": 0.1345181167125702, "learning_rate": 4.474421166409397e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3873 }, { "completion_length": 557.5, "epoch": 1.0737250554323725, "grad_norm": 0.3628879487514496, "kl": 0.1058134213089943, "learning_rate": 4.474152592542613e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3874 }, { "completion_length": 533.75, "epoch": 1.0740022172949002, "grad_norm": 0.35061365365982056, "kl": 0.10759204626083374, "learning_rate": 4.473883958136781e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3875 }, { "completion_length": 631.0, "epoch": 1.0742793791574279, "grad_norm": 0.0, "kl": 0.48669734597206116, "learning_rate": 4.473615263200139e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3876 }, { "completion_length": 691.5, "epoch": 1.0745565410199556, "grad_norm": 0.0, "kl": 0.10910657048225403, "learning_rate": 4.473346507740928e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3877 }, { "completion_length": 509.0, "epoch": 1.0748337028824833, "grad_norm": 0.0, "kl": 0.13183645904064178, "learning_rate": 4.473077691767387e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3878 }, { "completion_length": 548.5, "epoch": 1.0751108647450112, "grad_norm": 0.0, "kl": 0.1097319945693016, "learning_rate": 4.472808815287763e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3879 }, { "completion_length": 529.75, "epoch": 1.0753880266075388, "grad_norm": 0.34523847699165344, "kl": 0.10491378605365753, "learning_rate": 4.472539878310298e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3880 }, { "completion_length": 481.5, "epoch": 1.0756651884700665, "grad_norm": 0.3758555054664612, "kl": 0.15981535613536835, "learning_rate": 4.4722708808432405e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3881 }, { "completion_length": 412.5, "epoch": 1.0759423503325942, "grad_norm": 0.0, "kl": 0.2002960443496704, "learning_rate": 4.472001822894839e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3882 }, { "completion_length": 490.0, "epoch": 1.076219512195122, "grad_norm": 0.4101167619228363, "kl": 0.16734395921230316, "learning_rate": 4.471732704473346e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3883 }, { "completion_length": 452.75, "epoch": 1.0764966740576496, "grad_norm": 0.0, "kl": 0.11996642500162125, "learning_rate": 4.4714635255870125e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3884 }, { "completion_length": 544.5, "epoch": 1.0767738359201773, "grad_norm": 0.0, "kl": 0.10358233749866486, "learning_rate": 4.471194286244094e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3885 }, { "completion_length": 491.75, "epoch": 1.0770509977827052, "grad_norm": 0.4591425657272339, "kl": 0.13339385390281677, "learning_rate": 4.470924986452847e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3886 }, { "completion_length": 630.75, "epoch": 1.0773281596452329, "grad_norm": 0.31219491362571716, "kl": 0.09614508599042892, "learning_rate": 4.47065562622153e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3887 }, { "completion_length": 549.5, "epoch": 1.0776053215077606, "grad_norm": 0.0, "kl": 0.11688996106386185, "learning_rate": 4.470386205558401e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3888 }, { "completion_length": 600.0, "epoch": 1.0778824833702882, "grad_norm": 0.31763988733291626, "kl": 0.10334686189889908, "learning_rate": 4.470116724471724e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3889 }, { "completion_length": 500.75, "epoch": 1.078159645232816, "grad_norm": 0.3834162652492523, "kl": 0.15620474517345428, "learning_rate": 4.469847182969763e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3890 }, { "completion_length": 448.0, "epoch": 1.0784368070953436, "grad_norm": 0.0, "kl": 0.13086484372615814, "learning_rate": 4.469577581060784e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3891 }, { "completion_length": 599.75, "epoch": 1.0787139689578713, "grad_norm": 0.0, "kl": 0.09373059868812561, "learning_rate": 4.469307918753053e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3892 }, { "completion_length": 572.0, "epoch": 1.0789911308203992, "grad_norm": 0.0, "kl": 0.0950748398900032, "learning_rate": 4.46903819605484e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3893 }, { "completion_length": 586.75, "epoch": 1.079268292682927, "grad_norm": 0.3536205291748047, "kl": 0.1167118027806282, "learning_rate": 4.468768412974417e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3894 }, { "completion_length": 603.75, "epoch": 1.0795454545454546, "grad_norm": 0.3650248050689697, "kl": 0.11703480035066605, "learning_rate": 4.468498569520057e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3895 }, { "completion_length": 536.25, "epoch": 1.0798226164079823, "grad_norm": 0.0, "kl": 0.11911582201719284, "learning_rate": 4.468228665700034e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3896 }, { "completion_length": 581.75, "epoch": 1.08009977827051, "grad_norm": 0.3426099717617035, "kl": 0.1210223138332367, "learning_rate": 4.4679587015226255e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3897 }, { "completion_length": 551.0, "epoch": 1.0803769401330376, "grad_norm": 0.0, "kl": 0.1208638921380043, "learning_rate": 4.467688676996111e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3898 }, { "completion_length": 577.75, "epoch": 1.0806541019955653, "grad_norm": 0.38766369223594666, "kl": 0.10304785519838333, "learning_rate": 4.46741859212877e-06, "loss": -0.0, "reward": 3.59375, "reward_std": 2.5028629302978516, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 3899 }, { "completion_length": 488.0, "epoch": 1.0809312638580932, "grad_norm": 0.3481893241405487, "kl": 0.13352061808109283, "learning_rate": 4.467148446928884e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3900 }, { "completion_length": 575.25, "epoch": 1.081208425720621, "grad_norm": 0.3624461591243744, "kl": 0.11452268064022064, "learning_rate": 4.466878241404738e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3901 }, { "completion_length": 598.0, "epoch": 1.0814855875831486, "grad_norm": 0.33327946066856384, "kl": 0.09881305694580078, "learning_rate": 4.46660797556462e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3902 }, { "completion_length": 522.75, "epoch": 1.0817627494456763, "grad_norm": 0.0, "kl": 0.12188352644443512, "learning_rate": 4.466337649416815e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3903 }, { "completion_length": 495.5, "epoch": 1.082039911308204, "grad_norm": 0.3566287159919739, "kl": 0.1081613302230835, "learning_rate": 4.466067262969614e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3904 }, { "completion_length": 494.0, "epoch": 1.0823170731707317, "grad_norm": 0.40530553460121155, "kl": 0.11200679838657379, "learning_rate": 4.465796816231309e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3905 }, { "completion_length": 513.75, "epoch": 1.0825942350332594, "grad_norm": 0.33501502871513367, "kl": 0.10299006849527359, "learning_rate": 4.465526309210194e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3906 }, { "completion_length": 491.0, "epoch": 1.082871396895787, "grad_norm": 0.0, "kl": 0.12690292298793793, "learning_rate": 4.465255741914562e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3907 }, { "completion_length": 497.25, "epoch": 1.083148558758315, "grad_norm": 0.0, "kl": 0.1118192970752716, "learning_rate": 4.464985114352713e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3908 }, { "completion_length": 504.0, "epoch": 1.0834257206208426, "grad_norm": 0.0, "kl": 0.11187462508678436, "learning_rate": 4.464714426532944e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3909 }, { "completion_length": 628.5, "epoch": 1.0837028824833703, "grad_norm": 0.0, "kl": 0.11882326751947403, "learning_rate": 4.464443678463557e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3910 }, { "completion_length": 595.75, "epoch": 1.083980044345898, "grad_norm": 0.3711872100830078, "kl": 0.1103300154209137, "learning_rate": 4.4641728701528535e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3911 }, { "completion_length": 504.0, "epoch": 1.0842572062084257, "grad_norm": 0.0, "kl": 0.12993432581424713, "learning_rate": 4.463902001609139e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3912 }, { "completion_length": 643.25, "epoch": 1.0845343680709534, "grad_norm": 0.0, "kl": 0.08731544017791748, "learning_rate": 4.463631072840721e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3913 }, { "completion_length": 519.5, "epoch": 1.084811529933481, "grad_norm": 0.3599438965320587, "kl": 0.13249167799949646, "learning_rate": 4.463360083855904e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3914 }, { "completion_length": 563.75, "epoch": 1.0850886917960088, "grad_norm": 0.3110060393810272, "kl": 0.12592118978500366, "learning_rate": 4.463089034663003e-06, "loss": 0.0, "reward": 4.71875, "reward_std": 1.980043649673462, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.71875, "step": 3915 }, { "completion_length": 559.0, "epoch": 1.0853658536585367, "grad_norm": 0.3777741491794586, "kl": 0.11843852698802948, "learning_rate": 4.462817925270327e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3916 }, { "completion_length": 471.5, "epoch": 1.0856430155210643, "grad_norm": 0.0, "kl": 0.102101631462574, "learning_rate": 4.462546755686189e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3917 }, { "completion_length": 504.5, "epoch": 1.085920177383592, "grad_norm": 0.3512917459011078, "kl": 0.11441491544246674, "learning_rate": 4.462275525918908e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3918 }, { "completion_length": 491.0, "epoch": 1.0861973392461197, "grad_norm": 0.0, "kl": 0.11039070785045624, "learning_rate": 4.4620042359767976e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3919 }, { "completion_length": 689.25, "epoch": 1.0864745011086474, "grad_norm": 0.28880366683006287, "kl": 0.08209051936864853, "learning_rate": 4.4617328858681806e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3920 }, { "completion_length": 605.25, "epoch": 1.086751662971175, "grad_norm": 0.0, "kl": 0.08571800589561462, "learning_rate": 4.461461475601375e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3921 }, { "completion_length": 509.0, "epoch": 1.0870288248337028, "grad_norm": 0.0, "kl": 0.11615544557571411, "learning_rate": 4.461190005184707e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3922 }, { "completion_length": 536.0, "epoch": 1.0873059866962307, "grad_norm": 0.35669612884521484, "kl": 0.1075994074344635, "learning_rate": 4.460918474626499e-06, "loss": -0.0, "reward": 3.71875, "reward_std": 2.2738893032073975, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.71875, "step": 3923 }, { "completion_length": 539.5, "epoch": 1.0875831485587584, "grad_norm": 0.0, "kl": 0.09574945271015167, "learning_rate": 4.460646883935079e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3924 }, { "completion_length": 501.5, "epoch": 1.087860310421286, "grad_norm": 0.0, "kl": 0.10773643851280212, "learning_rate": 4.460375233118774e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3925 }, { "completion_length": 545.25, "epoch": 1.0881374722838137, "grad_norm": 0.0, "kl": 0.1025720164179802, "learning_rate": 4.460103522185917e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3926 }, { "completion_length": 610.75, "epoch": 1.0884146341463414, "grad_norm": 0.0, "kl": 0.1001489982008934, "learning_rate": 4.459831751144839e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3927 }, { "completion_length": 643.0, "epoch": 1.0886917960088691, "grad_norm": 0.3212488889694214, "kl": 0.10862154513597488, "learning_rate": 4.459559920003873e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3928 }, { "completion_length": 584.0, "epoch": 1.0889689578713968, "grad_norm": 0.0, "kl": 0.08667468279600143, "learning_rate": 4.459288028771356e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3929 }, { "completion_length": 530.75, "epoch": 1.0892461197339247, "grad_norm": 0.35202646255493164, "kl": 0.36347728967666626, "learning_rate": 4.4590160774556256e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3930 }, { "completion_length": 492.75, "epoch": 1.0895232815964524, "grad_norm": 0.37824609875679016, "kl": 0.1284056305885315, "learning_rate": 4.458744066065021e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3931 }, { "completion_length": 470.25, "epoch": 1.08980044345898, "grad_norm": 0.0, "kl": 0.13366323709487915, "learning_rate": 4.458471994607885e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3932 }, { "completion_length": 624.75, "epoch": 1.0900776053215078, "grad_norm": 0.3301818370819092, "kl": 0.10706058144569397, "learning_rate": 4.4581998630925595e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3933 }, { "completion_length": 508.25, "epoch": 1.0903547671840355, "grad_norm": 0.0, "kl": 0.12937143445014954, "learning_rate": 4.45792767152739e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3934 }, { "completion_length": 537.0, "epoch": 1.0906319290465631, "grad_norm": 0.3877791464328766, "kl": 0.11688411980867386, "learning_rate": 4.457655419920723e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3935 }, { "completion_length": 613.5, "epoch": 1.0909090909090908, "grad_norm": 0.0, "kl": 0.10311446338891983, "learning_rate": 4.457383108280909e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3936 }, { "completion_length": 552.5, "epoch": 1.0911862527716187, "grad_norm": 0.3679512143135071, "kl": 0.11543704569339752, "learning_rate": 4.457110736616297e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3937 }, { "completion_length": 488.25, "epoch": 1.0914634146341464, "grad_norm": 0.39457976818084717, "kl": 0.09980925172567368, "learning_rate": 4.456838304935241e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3938 }, { "completion_length": 512.5, "epoch": 1.091740576496674, "grad_norm": 0.37989872694015503, "kl": 0.13110916316509247, "learning_rate": 4.456565813246093e-06, "loss": -0.0, "reward": 4.375, "reward_std": 2.75, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3939 }, { "completion_length": 636.0, "epoch": 1.0920177383592018, "grad_norm": 0.0, "kl": 0.12843753397464752, "learning_rate": 4.4562932615572105e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3940 }, { "completion_length": 441.75, "epoch": 1.0922949002217295, "grad_norm": 0.5372626781463623, "kl": 0.14109553396701813, "learning_rate": 4.4560206498769536e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3941 }, { "completion_length": 627.0, "epoch": 1.0925720620842572, "grad_norm": 0.3086152970790863, "kl": 0.1040344089269638, "learning_rate": 4.455747978213679e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3942 }, { "completion_length": 500.25, "epoch": 1.0928492239467849, "grad_norm": 0.0, "kl": 0.12624570727348328, "learning_rate": 4.455475246575749e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3943 }, { "completion_length": 538.75, "epoch": 1.0931263858093125, "grad_norm": 0.0, "kl": 0.14023767411708832, "learning_rate": 4.455202454971529e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3944 }, { "completion_length": 657.75, "epoch": 1.0934035476718404, "grad_norm": 0.3105688691139221, "kl": 0.1065453290939331, "learning_rate": 4.454929603409382e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3945 }, { "completion_length": 666.75, "epoch": 1.0936807095343681, "grad_norm": 0.0, "kl": 0.1273995190858841, "learning_rate": 4.4546566918976775e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3946 }, { "completion_length": 617.25, "epoch": 1.0939578713968958, "grad_norm": 0.0, "kl": 0.09986800700426102, "learning_rate": 4.454383720444782e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3947 }, { "completion_length": 996.75, "epoch": 1.0942350332594235, "grad_norm": 0.0, "kl": 0.06541034579277039, "learning_rate": 4.454110689059069e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3948 }, { "completion_length": 516.0, "epoch": 1.0945121951219512, "grad_norm": 0.0, "kl": 0.11366544663906097, "learning_rate": 4.4538375977489105e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3949 }, { "completion_length": 520.75, "epoch": 1.0947893569844789, "grad_norm": 0.45291751623153687, "kl": 0.12066676467657089, "learning_rate": 4.4535644465226795e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3950 }, { "completion_length": 576.0, "epoch": 1.0950665188470066, "grad_norm": 0.0, "kl": 0.2226923108100891, "learning_rate": 4.453291235388753e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3951 }, { "completion_length": 507.5, "epoch": 1.0953436807095343, "grad_norm": 0.34537801146507263, "kl": 0.10184930264949799, "learning_rate": 4.4530179643555115e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3952 }, { "completion_length": 585.75, "epoch": 1.0956208425720622, "grad_norm": 0.0, "kl": 0.1113768219947815, "learning_rate": 4.452744633431333e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3953 }, { "completion_length": 598.0, "epoch": 1.0958980044345898, "grad_norm": 0.0, "kl": 0.10921933501958847, "learning_rate": 4.452471242624599e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3954 }, { "completion_length": 458.25, "epoch": 1.0961751662971175, "grad_norm": 0.0, "kl": 0.12578821182250977, "learning_rate": 4.452197791943695e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3955 }, { "completion_length": 554.25, "epoch": 1.0964523281596452, "grad_norm": 0.0, "kl": 0.10338430106639862, "learning_rate": 4.451924281397005e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3956 }, { "completion_length": 778.75, "epoch": 1.096729490022173, "grad_norm": 0.2459339201450348, "kl": 0.08198656141757965, "learning_rate": 4.451650710992918e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3957 }, { "completion_length": 550.5, "epoch": 1.0970066518847006, "grad_norm": 0.0, "kl": 0.12010055035352707, "learning_rate": 4.451377080739821e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3958 }, { "completion_length": 573.5, "epoch": 1.0972838137472283, "grad_norm": 0.0, "kl": 0.12190345674753189, "learning_rate": 4.451103390646107e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3959 }, { "completion_length": 647.5, "epoch": 1.0975609756097562, "grad_norm": 0.0, "kl": 0.09076445549726486, "learning_rate": 4.450829640720168e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3960 }, { "completion_length": 551.25, "epoch": 1.0978381374722839, "grad_norm": 0.0, "kl": 0.12919044494628906, "learning_rate": 4.4505558309704e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3961 }, { "completion_length": 598.5, "epoch": 1.0981152993348116, "grad_norm": 0.3007274270057678, "kl": 0.09497939050197601, "learning_rate": 4.450281961405198e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3962 }, { "completion_length": 743.5, "epoch": 1.0983924611973392, "grad_norm": 0.3409399688243866, "kl": 0.08799545466899872, "learning_rate": 4.4500080320329615e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3963 }, { "completion_length": 570.75, "epoch": 1.098669623059867, "grad_norm": 0.0, "kl": 0.10764020681381226, "learning_rate": 4.449734042862092e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3964 }, { "completion_length": 562.75, "epoch": 1.0989467849223946, "grad_norm": 0.49009838700294495, "kl": 0.11277312785387039, "learning_rate": 4.4494599939009885e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3965 }, { "completion_length": 636.25, "epoch": 1.0992239467849223, "grad_norm": 0.31531909108161926, "kl": 0.11186450719833374, "learning_rate": 4.449185885158056e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3966 }, { "completion_length": 626.75, "epoch": 1.0995011086474502, "grad_norm": 0.0, "kl": 0.10265056788921356, "learning_rate": 4.448911716641702e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3967 }, { "completion_length": 512.0, "epoch": 1.099778270509978, "grad_norm": 0.350080668926239, "kl": 0.13965444266796112, "learning_rate": 4.448637488360333e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3968 }, { "completion_length": 559.5, "epoch": 1.1000554323725056, "grad_norm": 0.0, "kl": 0.11110928654670715, "learning_rate": 4.448363200322358e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3969 }, { "completion_length": 601.75, "epoch": 1.1003325942350333, "grad_norm": 0.0, "kl": 0.1211012601852417, "learning_rate": 4.4480888525361876e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3970 }, { "completion_length": 630.0, "epoch": 1.100609756097561, "grad_norm": 0.0, "kl": 0.11645912379026413, "learning_rate": 4.447814445010237e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3971 }, { "completion_length": 633.25, "epoch": 1.1008869179600886, "grad_norm": 0.3404337763786316, "kl": 0.12003887444734573, "learning_rate": 4.44753997775292e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3972 }, { "completion_length": 565.0, "epoch": 1.1011640798226163, "grad_norm": 0.0, "kl": 0.10576701909303665, "learning_rate": 4.447265450772653e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3973 }, { "completion_length": 588.25, "epoch": 1.1014412416851442, "grad_norm": 0.0, "kl": 0.1173822209239006, "learning_rate": 4.446990864077856e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3974 }, { "completion_length": 517.25, "epoch": 1.101718403547672, "grad_norm": 0.0, "kl": 0.11940927058458328, "learning_rate": 4.446716217676947e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3975 }, { "completion_length": 563.5, "epoch": 1.1019955654101996, "grad_norm": 0.40370693802833557, "kl": 0.10570406913757324, "learning_rate": 4.446441511578351e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3976 }, { "completion_length": 646.25, "epoch": 1.1022727272727273, "grad_norm": 0.0, "kl": 0.12236551940441132, "learning_rate": 4.446166745790491e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3977 }, { "completion_length": 630.25, "epoch": 1.102549889135255, "grad_norm": 0.0, "kl": 0.08666589111089706, "learning_rate": 4.4458919203217925e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3978 }, { "completion_length": 560.0, "epoch": 1.1028270509977827, "grad_norm": 0.3203895390033722, "kl": 0.11485198140144348, "learning_rate": 4.445617035180683e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3979 }, { "completion_length": 553.25, "epoch": 1.1031042128603104, "grad_norm": 0.0, "kl": 0.11634854972362518, "learning_rate": 4.445342090375593e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3980 }, { "completion_length": 683.75, "epoch": 1.103381374722838, "grad_norm": 0.0, "kl": 0.10263050347566605, "learning_rate": 4.445067085914953e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3981 }, { "completion_length": 638.25, "epoch": 1.103658536585366, "grad_norm": 0.0, "kl": 0.08796016126871109, "learning_rate": 4.444792021807197e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3982 }, { "completion_length": 548.5, "epoch": 1.1039356984478936, "grad_norm": 0.0, "kl": 0.1507130116224289, "learning_rate": 4.44451689806076e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3983 }, { "completion_length": 599.25, "epoch": 1.1042128603104213, "grad_norm": 0.0, "kl": 0.14247944951057434, "learning_rate": 4.444241714684079e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3984 }, { "completion_length": 579.75, "epoch": 1.104490022172949, "grad_norm": 0.0, "kl": 0.13393402099609375, "learning_rate": 4.443966471685592e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3985 }, { "completion_length": 543.0, "epoch": 1.1047671840354767, "grad_norm": 0.0, "kl": 0.1384744942188263, "learning_rate": 4.443691169073741e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3986 }, { "completion_length": 586.5, "epoch": 1.1050443458980044, "grad_norm": 0.33430424332618713, "kl": 0.10918036848306656, "learning_rate": 4.443415806856966e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3987 }, { "completion_length": 910.25, "epoch": 1.105321507760532, "grad_norm": 0.30363988876342773, "kl": 0.08143271505832672, "learning_rate": 4.443140385043713e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3988 }, { "completion_length": 577.0, "epoch": 1.10559866962306, "grad_norm": 0.0, "kl": 0.1259208619594574, "learning_rate": 4.442864903642428e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3989 }, { "completion_length": 641.0, "epoch": 1.1058758314855877, "grad_norm": 0.33248400688171387, "kl": 0.12206875532865524, "learning_rate": 4.442589362661558e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3990 }, { "completion_length": 592.25, "epoch": 1.1061529933481153, "grad_norm": 0.44182834029197693, "kl": 0.11614920198917389, "learning_rate": 4.442313762109554e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3991 }, { "completion_length": 649.25, "epoch": 1.106430155210643, "grad_norm": 0.32078853249549866, "kl": 0.15140025317668915, "learning_rate": 4.442038101994866e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3992 }, { "completion_length": 587.25, "epoch": 1.1067073170731707, "grad_norm": 0.0, "kl": 0.10016050934791565, "learning_rate": 4.441762382325948e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3993 }, { "completion_length": 564.25, "epoch": 1.1069844789356984, "grad_norm": 0.2991292476654053, "kl": 0.09389782696962357, "learning_rate": 4.441486603111256e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3994 }, { "completion_length": 602.0, "epoch": 1.107261640798226, "grad_norm": 0.0, "kl": 0.1234038844704628, "learning_rate": 4.441210764359246e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3995 }, { "completion_length": 565.5, "epoch": 1.1075388026607538, "grad_norm": 0.3923352360725403, "kl": 0.11082062125205994, "learning_rate": 4.440934866078377e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3996 }, { "completion_length": 589.25, "epoch": 1.1078159645232817, "grad_norm": 0.0, "kl": 0.10739375650882721, "learning_rate": 4.4406589082771095e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3997 }, { "completion_length": 711.75, "epoch": 1.1080931263858094, "grad_norm": 0.31307274103164673, "kl": 0.10515294224023819, "learning_rate": 4.440382890963907e-06, "loss": -0.0, "reward": 1.71875, "reward_std": 0.0625, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.71875, "step": 3998 }, { "completion_length": 591.75, "epoch": 1.108370288248337, "grad_norm": 0.0, "kl": 0.08979138731956482, "learning_rate": 4.440106814147232e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 3999 }, { "completion_length": 643.75, "epoch": 1.1086474501108647, "grad_norm": 0.0, "kl": 0.09636707603931427, "learning_rate": 4.439830677835552e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4000 }, { "completion_length": 620.0, "epoch": 1.1089246119733924, "grad_norm": 0.0, "kl": 0.1072012335062027, "learning_rate": 4.439554482037335e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4001 }, { "completion_length": 581.5, "epoch": 1.1092017738359201, "grad_norm": 0.0, "kl": 0.10111614316701889, "learning_rate": 4.43927822676105e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4002 }, { "completion_length": 605.25, "epoch": 1.1094789356984478, "grad_norm": 0.0, "kl": 0.09683984518051147, "learning_rate": 4.439001912015169e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4003 }, { "completion_length": 619.0, "epoch": 1.1097560975609757, "grad_norm": 0.0, "kl": 0.09946690499782562, "learning_rate": 4.438725537808165e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4004 }, { "completion_length": 634.0, "epoch": 1.1100332594235034, "grad_norm": 0.31993547081947327, "kl": 0.10573668032884598, "learning_rate": 4.4384491041485145e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4005 }, { "completion_length": 610.75, "epoch": 1.110310421286031, "grad_norm": 0.0, "kl": 0.13123318552970886, "learning_rate": 4.438172611044694e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4006 }, { "completion_length": 926.25, "epoch": 1.1105875831485588, "grad_norm": 0.0, "kl": 0.11115503311157227, "learning_rate": 4.437896058505181e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4007 }, { "completion_length": 648.75, "epoch": 1.1108647450110865, "grad_norm": 0.3040906488895416, "kl": 0.10066103935241699, "learning_rate": 4.437619446538458e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4008 }, { "completion_length": 624.0, "epoch": 1.1111419068736141, "grad_norm": 0.0, "kl": 0.12006114423274994, "learning_rate": 4.437342775153007e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4009 }, { "completion_length": 615.75, "epoch": 1.1114190687361418, "grad_norm": 0.42140868306159973, "kl": 0.13277265429496765, "learning_rate": 4.437066044357312e-06, "loss": 0.0, "reward": 1.5625, "reward_std": 0.375, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.6875, "step": 4010 }, { "completion_length": 657.25, "epoch": 1.1116962305986697, "grad_norm": 0.0, "kl": 0.08434519171714783, "learning_rate": 4.436789254159859e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4011 }, { "completion_length": 598.5, "epoch": 1.1119733924611974, "grad_norm": 0.3249695599079132, "kl": 0.10693563520908356, "learning_rate": 4.436512404569136e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4012 }, { "completion_length": 588.0, "epoch": 1.112250554323725, "grad_norm": 0.3400266170501709, "kl": 0.10436404496431351, "learning_rate": 4.436235495593635e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4013 }, { "completion_length": 606.0, "epoch": 1.1125277161862528, "grad_norm": 0.3279305696487427, "kl": 0.1292479932308197, "learning_rate": 4.435958527241845e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4014 }, { "completion_length": 617.5, "epoch": 1.1128048780487805, "grad_norm": 0.0, "kl": 0.10330881923437119, "learning_rate": 4.43568149952226e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4015 }, { "completion_length": 539.0, "epoch": 1.1130820399113082, "grad_norm": 0.0, "kl": 0.10615560412406921, "learning_rate": 4.435404412443376e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4016 }, { "completion_length": 569.25, "epoch": 1.1133592017738358, "grad_norm": 0.0, "kl": 0.12396739423274994, "learning_rate": 4.43512726601369e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4017 }, { "completion_length": 479.75, "epoch": 1.1136363636363635, "grad_norm": 0.33753347396850586, "kl": 0.14164088666439056, "learning_rate": 4.4348500602417e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4018 }, { "completion_length": 609.25, "epoch": 1.1139135254988914, "grad_norm": 0.0, "kl": 0.10813036561012268, "learning_rate": 4.434572795135908e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4019 }, { "completion_length": 615.5, "epoch": 1.1141906873614191, "grad_norm": 0.3058982193470001, "kl": 0.10025778412818909, "learning_rate": 4.434295470704816e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4020 }, { "completion_length": 504.5, "epoch": 1.1144678492239468, "grad_norm": 0.45508819818496704, "kl": 0.15197674930095673, "learning_rate": 4.434018086956928e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4021 }, { "completion_length": 643.5, "epoch": 1.1147450110864745, "grad_norm": 0.29358717799186707, "kl": 0.10092262178659439, "learning_rate": 4.43374064390075e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4022 }, { "completion_length": 562.0, "epoch": 1.1150221729490022, "grad_norm": 0.3055669069290161, "kl": 0.11267571896314621, "learning_rate": 4.433463141544791e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4023 }, { "completion_length": 665.0, "epoch": 1.1152993348115299, "grad_norm": 0.2965506613254547, "kl": 0.08665721863508224, "learning_rate": 4.433185579897562e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4024 }, { "completion_length": 535.0, "epoch": 1.1155764966740576, "grad_norm": 0.0, "kl": 0.10833076387643814, "learning_rate": 4.4329079589675715e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4025 }, { "completion_length": 512.5, "epoch": 1.1158536585365855, "grad_norm": 0.0, "kl": 0.2942132353782654, "learning_rate": 4.4326302787633356e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4026 }, { "completion_length": 596.25, "epoch": 1.1161308203991132, "grad_norm": 0.3238367438316345, "kl": 0.21993260085582733, "learning_rate": 4.432352539293367e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4027 }, { "completion_length": 640.0, "epoch": 1.1164079822616408, "grad_norm": 0.30258527398109436, "kl": 0.09471262246370316, "learning_rate": 4.432074740566185e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4028 }, { "completion_length": 603.0, "epoch": 1.1166851441241685, "grad_norm": 0.42397576570510864, "kl": 0.10796574503183365, "learning_rate": 4.43179688259031e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4029 }, { "completion_length": 611.0, "epoch": 1.1169623059866962, "grad_norm": 0.3376257121562958, "kl": 0.1079874187707901, "learning_rate": 4.431518965374259e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4030 }, { "completion_length": 532.75, "epoch": 1.117239467849224, "grad_norm": 0.0, "kl": 0.09517070651054382, "learning_rate": 4.431240988926556e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4031 }, { "completion_length": 1066.5, "epoch": 1.1175166297117516, "grad_norm": 0.2768145203590393, "kl": 1.096986174583435, "learning_rate": 4.430962953255725e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4032 }, { "completion_length": 575.0, "epoch": 1.1177937915742793, "grad_norm": 0.0, "kl": 0.10664314776659012, "learning_rate": 4.4306848583702945e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4033 }, { "completion_length": 646.5, "epoch": 1.1180709534368072, "grad_norm": 0.31926390528678894, "kl": 0.09337544441223145, "learning_rate": 4.4304067042787905e-06, "loss": -0.0, "reward": 2.84375, "reward_std": 1.8690879344940186, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.71875, "step": 4034 }, { "completion_length": 583.25, "epoch": 1.1183481152993349, "grad_norm": 0.0, "kl": 0.1164097785949707, "learning_rate": 4.430128490989743e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4035 }, { "completion_length": 915.75, "epoch": 1.1186252771618626, "grad_norm": 0.19540858268737793, "kl": 0.09341863542795181, "learning_rate": 4.4298502185116835e-06, "loss": -0.0, "reward": 1.09375, "reward_std": 1.3125, "rewards/confident_score_func": -0.25, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.59375, "step": 4036 }, { "completion_length": 598.5, "epoch": 1.1189024390243902, "grad_norm": 0.0, "kl": 0.13628369569778442, "learning_rate": 4.429571886853146e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4037 }, { "completion_length": 632.25, "epoch": 1.119179600886918, "grad_norm": 0.0, "kl": 0.11660019308328629, "learning_rate": 4.429293496022664e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4038 }, { "completion_length": 540.25, "epoch": 1.1194567627494456, "grad_norm": 0.0, "kl": 0.18874619901180267, "learning_rate": 4.429015046028779e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4039 }, { "completion_length": 685.25, "epoch": 1.1197339246119733, "grad_norm": 0.2710604965686798, "kl": 0.10870540142059326, "learning_rate": 4.428736536880025e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4040 }, { "completion_length": 599.5, "epoch": 1.1200110864745012, "grad_norm": 0.0, "kl": 0.08886770904064178, "learning_rate": 4.428457968584945e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4041 }, { "completion_length": 604.5, "epoch": 1.120288248337029, "grad_norm": 0.3601745665073395, "kl": 0.13353988528251648, "learning_rate": 4.428179341152083e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4042 }, { "completion_length": 593.0, "epoch": 1.1205654101995566, "grad_norm": 0.0, "kl": 0.13469211757183075, "learning_rate": 4.427900654589979e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4043 }, { "completion_length": 568.75, "epoch": 1.1208425720620843, "grad_norm": 0.30944907665252686, "kl": 0.11850277334451675, "learning_rate": 4.427621908907183e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4044 }, { "completion_length": 691.75, "epoch": 1.121119733924612, "grad_norm": 0.0, "kl": 0.09262877702713013, "learning_rate": 4.4273431041122425e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4045 }, { "completion_length": 606.5, "epoch": 1.1213968957871396, "grad_norm": 0.0, "kl": 0.09790804237127304, "learning_rate": 4.427064240213706e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4046 }, { "completion_length": 619.0, "epoch": 1.1216740576496673, "grad_norm": 0.0, "kl": 0.1002490222454071, "learning_rate": 4.426785317220125e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4047 }, { "completion_length": 557.75, "epoch": 1.1219512195121952, "grad_norm": 0.0, "kl": 0.1337081640958786, "learning_rate": 4.426506335140054e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4048 }, { "completion_length": 679.25, "epoch": 1.122228381374723, "grad_norm": 0.3162086009979248, "kl": 0.10654155910015106, "learning_rate": 4.426227293982048e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4049 }, { "completion_length": 528.75, "epoch": 1.1225055432372506, "grad_norm": 0.35030120611190796, "kl": 0.1364450454711914, "learning_rate": 4.425948193754664e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4050 }, { "completion_length": 523.0, "epoch": 1.1227827050997783, "grad_norm": 0.0, "kl": 0.11595366150140762, "learning_rate": 4.42566903446646e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4051 }, { "completion_length": 543.25, "epoch": 1.123059866962306, "grad_norm": 0.35409605503082275, "kl": 0.1282186359167099, "learning_rate": 4.425389816125998e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4052 }, { "completion_length": 562.25, "epoch": 1.1233370288248337, "grad_norm": 0.3506770431995392, "kl": 0.10987795144319534, "learning_rate": 4.425110538741839e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4053 }, { "completion_length": 560.0, "epoch": 1.1236141906873613, "grad_norm": 0.0, "kl": 0.08236589282751083, "learning_rate": 4.424831202322548e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4054 }, { "completion_length": 546.0, "epoch": 1.123891352549889, "grad_norm": 0.0, "kl": 0.1401853710412979, "learning_rate": 4.424551806876692e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4055 }, { "completion_length": 530.75, "epoch": 1.124168514412417, "grad_norm": 0.0, "kl": 0.12750723958015442, "learning_rate": 4.424272352412837e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4056 }, { "completion_length": 562.25, "epoch": 1.1244456762749446, "grad_norm": 0.0, "kl": 0.11163078993558884, "learning_rate": 4.423992838939555e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4057 }, { "completion_length": 609.5, "epoch": 1.1247228381374723, "grad_norm": 0.0, "kl": 0.1103503406047821, "learning_rate": 4.423713266465415e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4058 }, { "completion_length": 593.75, "epoch": 1.125, "grad_norm": 0.0, "kl": 0.1271127611398697, "learning_rate": 4.423433634998993e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4059 }, { "completion_length": 685.5, "epoch": 1.1252771618625277, "grad_norm": 0.2959314286708832, "kl": 0.10314217209815979, "learning_rate": 4.423153944548861e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4060 }, { "completion_length": 588.5, "epoch": 1.1255543237250554, "grad_norm": 0.0, "kl": 0.11252515017986298, "learning_rate": 4.422874195123598e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4061 }, { "completion_length": 513.5, "epoch": 1.125831485587583, "grad_norm": 0.0, "kl": 0.10279049724340439, "learning_rate": 4.4225943867317835e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4062 }, { "completion_length": 641.75, "epoch": 1.1261086474501107, "grad_norm": 0.0, "kl": 0.26877495646476746, "learning_rate": 4.4223145193819964e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4063 }, { "completion_length": 510.0, "epoch": 1.1263858093126387, "grad_norm": 0.0, "kl": 0.09131129831075668, "learning_rate": 4.422034593082819e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4064 }, { "completion_length": 601.75, "epoch": 1.1266629711751663, "grad_norm": 0.3501296937465668, "kl": 0.09778490662574768, "learning_rate": 4.421754607842837e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4065 }, { "completion_length": 606.25, "epoch": 1.126940133037694, "grad_norm": 0.3080967664718628, "kl": 0.10522913932800293, "learning_rate": 4.421474563670635e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4066 }, { "completion_length": 689.0, "epoch": 1.1272172949002217, "grad_norm": 0.27618077397346497, "kl": 0.09488727152347565, "learning_rate": 4.4211944605748016e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4067 }, { "completion_length": 678.5, "epoch": 1.1274944567627494, "grad_norm": 0.2916496694087982, "kl": 0.1001235619187355, "learning_rate": 4.420914298563925e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4068 }, { "completion_length": 546.75, "epoch": 1.127771618625277, "grad_norm": 0.0, "kl": 0.1518857181072235, "learning_rate": 4.420634077646598e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4069 }, { "completion_length": 629.25, "epoch": 1.1280487804878048, "grad_norm": 0.0, "kl": 0.08985007554292679, "learning_rate": 4.4203537978314146e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4070 }, { "completion_length": 552.75, "epoch": 1.1283259423503327, "grad_norm": 0.0, "kl": 0.1198124885559082, "learning_rate": 4.4200734591269675e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4071 }, { "completion_length": 602.75, "epoch": 1.1286031042128604, "grad_norm": 0.0, "kl": 0.09646792709827423, "learning_rate": 4.419793061541856e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4072 }, { "completion_length": 630.25, "epoch": 1.128880266075388, "grad_norm": 0.28889790177345276, "kl": 0.10185536742210388, "learning_rate": 4.419512605084677e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4073 }, { "completion_length": 636.0, "epoch": 1.1291574279379157, "grad_norm": 0.0, "kl": 0.12823493778705597, "learning_rate": 4.41923208976403e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4074 }, { "completion_length": 576.0, "epoch": 1.1294345898004434, "grad_norm": 0.0, "kl": 0.10625044256448746, "learning_rate": 4.41895151558852e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4075 }, { "completion_length": 599.5, "epoch": 1.129711751662971, "grad_norm": 0.0, "kl": 0.10475967824459076, "learning_rate": 4.4186708825667495e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4076 }, { "completion_length": 629.25, "epoch": 1.1299889135254988, "grad_norm": 0.0, "kl": 0.10704852640628815, "learning_rate": 4.418390190707324e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4077 }, { "completion_length": 624.75, "epoch": 1.1302660753880267, "grad_norm": 0.0, "kl": 0.10178109258413315, "learning_rate": 4.418109440018852e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4078 }, { "completion_length": 575.5, "epoch": 1.1305432372505544, "grad_norm": 0.33532729744911194, "kl": 0.1005312129855156, "learning_rate": 4.417828630509943e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4079 }, { "completion_length": 468.75, "epoch": 1.130820399113082, "grad_norm": 0.3790353834629059, "kl": 0.12476608902215958, "learning_rate": 4.417547762189207e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4080 }, { "completion_length": 620.5, "epoch": 1.1310975609756098, "grad_norm": 0.0, "kl": 0.11613646149635315, "learning_rate": 4.417266835065259e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4081 }, { "completion_length": 536.75, "epoch": 1.1313747228381374, "grad_norm": 0.0, "kl": 0.12389133125543594, "learning_rate": 4.416985849146712e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4082 }, { "completion_length": 723.25, "epoch": 1.1316518847006651, "grad_norm": 0.0, "kl": 0.097527876496315, "learning_rate": 4.416704804442183e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4083 }, { "completion_length": 548.75, "epoch": 1.1319290465631928, "grad_norm": 0.3097502589225769, "kl": 0.11253244429826736, "learning_rate": 4.416423700960293e-06, "loss": 0.0, "reward": 2.09375, "reward_std": 2.7336158752441406, "rewards/confident_score_func": 0.25, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.59375, "step": 4084 }, { "completion_length": 749.75, "epoch": 1.1322062084257207, "grad_norm": 0.0, "kl": 0.10456366837024689, "learning_rate": 4.416142538709658e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4085 }, { "completion_length": 566.75, "epoch": 1.1324833702882484, "grad_norm": 0.45218583941459656, "kl": 0.08907363563776016, "learning_rate": 4.415861317698903e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4086 }, { "completion_length": 603.5, "epoch": 1.132760532150776, "grad_norm": 0.0, "kl": 0.11202310025691986, "learning_rate": 4.415580037936652e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4087 }, { "completion_length": 577.0, "epoch": 1.1330376940133038, "grad_norm": 0.45284944772720337, "kl": 0.11471308767795563, "learning_rate": 4.415298699431528e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4088 }, { "completion_length": 601.25, "epoch": 1.1333148558758315, "grad_norm": 0.2814381420612335, "kl": 0.10614064335823059, "learning_rate": 4.4150173021921625e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4089 }, { "completion_length": 536.75, "epoch": 1.1335920177383592, "grad_norm": 0.0, "kl": 0.12569616734981537, "learning_rate": 4.414735846227181e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4090 }, { "completion_length": 578.0, "epoch": 1.1338691796008868, "grad_norm": 0.32921960949897766, "kl": 0.08386282622814178, "learning_rate": 4.414454331545217e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4091 }, { "completion_length": 574.0, "epoch": 1.1341463414634148, "grad_norm": 0.35958388447761536, "kl": 0.10619501024484634, "learning_rate": 4.4141727581549025e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4092 }, { "completion_length": 609.25, "epoch": 1.1344235033259424, "grad_norm": 0.0, "kl": 0.09862134605646133, "learning_rate": 4.413891126064872e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4093 }, { "completion_length": 635.0, "epoch": 1.1347006651884701, "grad_norm": 0.0, "kl": 0.11736235022544861, "learning_rate": 4.413609435283762e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4094 }, { "completion_length": 916.0, "epoch": 1.1349778270509978, "grad_norm": 0.20281945168972015, "kl": 0.09204103797674179, "learning_rate": 4.413327685820213e-06, "loss": 0.0, "reward": 5.5, "reward_std": 0.5, "rewards/confident_score_func": 1.75, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4095 }, { "completion_length": 602.75, "epoch": 1.1352549889135255, "grad_norm": 0.0, "kl": 0.09412579983472824, "learning_rate": 4.41304587768286e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4096 }, { "completion_length": 597.75, "epoch": 1.1355321507760532, "grad_norm": 0.3493887484073639, "kl": 0.11007765680551529, "learning_rate": 4.41276401088035e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4097 }, { "completion_length": 490.0, "epoch": 1.1358093126385809, "grad_norm": 0.0, "kl": 0.1300327330827713, "learning_rate": 4.412482085421325e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4098 }, { "completion_length": 546.5, "epoch": 1.1360864745011086, "grad_norm": 0.0, "kl": 0.14969117939472198, "learning_rate": 4.412200101314429e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4099 }, { "completion_length": 511.5, "epoch": 1.1363636363636362, "grad_norm": 0.35608071088790894, "kl": 0.11898753046989441, "learning_rate": 4.411918058568311e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4100 }, { "completion_length": 678.25, "epoch": 1.1366407982261642, "grad_norm": 0.0, "kl": 0.10382666438817978, "learning_rate": 4.411635957191619e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4101 }, { "completion_length": 681.25, "epoch": 1.1369179600886918, "grad_norm": 0.253475546836853, "kl": 0.10210271179676056, "learning_rate": 4.411353797193005e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4102 }, { "completion_length": 568.25, "epoch": 1.1371951219512195, "grad_norm": 0.0, "kl": 0.14174683392047882, "learning_rate": 4.4110715785811205e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4103 }, { "completion_length": 677.25, "epoch": 1.1374722838137472, "grad_norm": 0.32431620359420776, "kl": 0.097654789686203, "learning_rate": 4.410789301364621e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4104 }, { "completion_length": 685.25, "epoch": 1.137749445676275, "grad_norm": 0.3462902009487152, "kl": 0.12412061542272568, "learning_rate": 4.410506965552162e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4105 }, { "completion_length": 603.75, "epoch": 1.1380266075388026, "grad_norm": 0.0, "kl": 0.12590675055980682, "learning_rate": 4.410224571152402e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4106 }, { "completion_length": 502.25, "epoch": 1.1383037694013303, "grad_norm": 0.40798768401145935, "kl": 0.12073635309934616, "learning_rate": 4.409942118174001e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4107 }, { "completion_length": 535.5, "epoch": 1.1385809312638582, "grad_norm": 0.3219776451587677, "kl": 0.11216293275356293, "learning_rate": 4.40965960662562e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4108 }, { "completion_length": 594.75, "epoch": 1.1388580931263859, "grad_norm": 0.0, "kl": 0.10584395378828049, "learning_rate": 4.409377036515924e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4109 }, { "completion_length": 571.25, "epoch": 1.1391352549889135, "grad_norm": 0.3367953896522522, "kl": 0.11908408999443054, "learning_rate": 4.409094407853576e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4110 }, { "completion_length": 606.0, "epoch": 1.1394124168514412, "grad_norm": 0.3478521406650543, "kl": 0.11043468862771988, "learning_rate": 4.408811720647244e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4111 }, { "completion_length": 615.75, "epoch": 1.139689578713969, "grad_norm": 0.3373129665851593, "kl": 0.10689879208803177, "learning_rate": 4.408528974905597e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4112 }, { "completion_length": 501.25, "epoch": 1.1399667405764966, "grad_norm": 0.3114277422428131, "kl": 0.13255344331264496, "learning_rate": 4.408246170637307e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4113 }, { "completion_length": 646.5, "epoch": 1.1402439024390243, "grad_norm": 0.39754173159599304, "kl": 0.10424871742725372, "learning_rate": 4.407963307851044e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4114 }, { "completion_length": 669.75, "epoch": 1.1405210643015522, "grad_norm": 0.30510497093200684, "kl": 0.13412421941757202, "learning_rate": 4.407680386555483e-06, "loss": 0.0, "reward": 2.71875, "reward_std": 2.0216922760009766, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.71875, "step": 4115 }, { "completion_length": 637.0, "epoch": 1.1407982261640799, "grad_norm": 0.0, "kl": 0.10647991299629211, "learning_rate": 4.4073974067593e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4116 }, { "completion_length": 597.25, "epoch": 1.1410753880266076, "grad_norm": 0.0, "kl": 0.2663760185241699, "learning_rate": 4.407114368471174e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4117 }, { "completion_length": 451.5, "epoch": 1.1413525498891353, "grad_norm": 0.698698103427887, "kl": 0.1737302988767624, "learning_rate": 4.406831271699783e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4118 }, { "completion_length": 648.0, "epoch": 1.141629711751663, "grad_norm": 0.0, "kl": 0.2873891294002533, "learning_rate": 4.40654811645381e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4119 }, { "completion_length": 578.0, "epoch": 1.1419068736141906, "grad_norm": 0.0, "kl": 0.11429549753665924, "learning_rate": 4.406264902741937e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4120 }, { "completion_length": 521.75, "epoch": 1.1421840354767183, "grad_norm": 0.0, "kl": 0.1250840276479721, "learning_rate": 4.405981630572849e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4121 }, { "completion_length": 557.5, "epoch": 1.1424611973392462, "grad_norm": 0.0, "kl": 0.1626339852809906, "learning_rate": 4.405698299955234e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4122 }, { "completion_length": 603.25, "epoch": 1.142738359201774, "grad_norm": 0.3393104672431946, "kl": 0.19161124527454376, "learning_rate": 4.405414910897778e-06, "loss": 0.0, "reward": 1.375, "reward_std": 0.75, "rewards/confident_score_func": -0.25, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4123 }, { "completion_length": 619.5, "epoch": 1.1430155210643016, "grad_norm": 0.36711424589157104, "kl": 0.10479575395584106, "learning_rate": 4.405131463409174e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4124 }, { "completion_length": 705.5, "epoch": 1.1432926829268293, "grad_norm": 0.5792018175125122, "kl": 0.10182159394025803, "learning_rate": 4.404847957498113e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4125 }, { "completion_length": 604.0, "epoch": 1.143569844789357, "grad_norm": 0.3667278289794922, "kl": 0.13611149787902832, "learning_rate": 4.4045643931732894e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4126 }, { "completion_length": 623.0, "epoch": 1.1438470066518847, "grad_norm": 0.3689090609550476, "kl": 0.11482033133506775, "learning_rate": 4.404280770443398e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4127 }, { "completion_length": 551.0, "epoch": 1.1441241685144123, "grad_norm": 0.0, "kl": 0.126150980591774, "learning_rate": 4.403997089317138e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4128 }, { "completion_length": 643.0, "epoch": 1.1444013303769403, "grad_norm": 0.0, "kl": 0.09620723128318787, "learning_rate": 4.403713349803207e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4129 }, { "completion_length": 521.25, "epoch": 1.144678492239468, "grad_norm": 0.0, "kl": 0.10959164053201675, "learning_rate": 4.4034295519103065e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4130 }, { "completion_length": 522.5, "epoch": 1.1449556541019956, "grad_norm": 0.3468502163887024, "kl": 0.11577688157558441, "learning_rate": 4.4031456956471405e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4131 }, { "completion_length": 696.25, "epoch": 1.1452328159645233, "grad_norm": 0.29349780082702637, "kl": 0.08777763694524765, "learning_rate": 4.4028617810224125e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4132 }, { "completion_length": 716.25, "epoch": 1.145509977827051, "grad_norm": 0.25554224848747253, "kl": 1.105306625366211, "learning_rate": 4.402577808044829e-06, "loss": 0.0, "reward": 5.59375, "reward_std": 0.3125, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 4133 }, { "completion_length": 633.75, "epoch": 1.1457871396895787, "grad_norm": 0.25662606954574585, "kl": 0.10775107145309448, "learning_rate": 4.402293776723099e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4134 }, { "completion_length": 577.25, "epoch": 1.1460643015521064, "grad_norm": 0.3335363566875458, "kl": 0.11442297697067261, "learning_rate": 4.402009687065933e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4135 }, { "completion_length": 578.5, "epoch": 1.146341463414634, "grad_norm": 0.0, "kl": 0.09762299805879593, "learning_rate": 4.40172553908204e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4136 }, { "completion_length": 632.25, "epoch": 1.1466186252771617, "grad_norm": 0.0, "kl": 0.10775671154260635, "learning_rate": 4.401441332780137e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4137 }, { "completion_length": 646.5, "epoch": 1.1468957871396896, "grad_norm": 0.0, "kl": 0.15144482254981995, "learning_rate": 4.401157068168939e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4138 }, { "completion_length": 910.0, "epoch": 1.1471729490022173, "grad_norm": 0.32293233275413513, "kl": 0.09369003772735596, "learning_rate": 4.400872745257161e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4139 }, { "completion_length": 544.5, "epoch": 1.147450110864745, "grad_norm": 0.0, "kl": 0.10839065909385681, "learning_rate": 4.4005883640535244e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4140 }, { "completion_length": 630.5, "epoch": 1.1477272727272727, "grad_norm": 0.0, "kl": 0.3392886519432068, "learning_rate": 4.400303924566748e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4141 }, { "completion_length": 628.0, "epoch": 1.1480044345898004, "grad_norm": 0.0, "kl": 0.10317970812320709, "learning_rate": 4.4000194268055565e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4142 }, { "completion_length": 628.75, "epoch": 1.148281596452328, "grad_norm": 0.3359503746032715, "kl": 0.11672715842723846, "learning_rate": 4.3997348707786715e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4143 }, { "completion_length": 664.25, "epoch": 1.1485587583148558, "grad_norm": 0.0, "kl": 0.09776812046766281, "learning_rate": 4.399450256494821e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4144 }, { "completion_length": 596.25, "epoch": 1.1488359201773837, "grad_norm": 0.3310320973396301, "kl": 0.13781607151031494, "learning_rate": 4.399165583962734e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4145 }, { "completion_length": 534.0, "epoch": 1.1491130820399114, "grad_norm": 0.0, "kl": 0.20255357027053833, "learning_rate": 4.3988808531911375e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4146 }, { "completion_length": 562.0, "epoch": 1.149390243902439, "grad_norm": 0.3445182740688324, "kl": 0.1014547348022461, "learning_rate": 4.3985960641887645e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4147 }, { "completion_length": 583.5, "epoch": 1.1496674057649667, "grad_norm": 0.33553922176361084, "kl": 0.1306297332048416, "learning_rate": 4.3983112169643495e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4148 }, { "completion_length": 634.5, "epoch": 1.1499445676274944, "grad_norm": 0.34732896089553833, "kl": 0.10091177374124527, "learning_rate": 4.398026311526625e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4149 }, { "completion_length": 691.25, "epoch": 1.150221729490022, "grad_norm": 0.26740431785583496, "kl": 0.167418971657753, "learning_rate": 4.397741347884329e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4150 }, { "completion_length": 701.75, "epoch": 1.1504988913525498, "grad_norm": 0.2745802700519562, "kl": 0.09543640166521072, "learning_rate": 4.397456326046201e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4151 }, { "completion_length": 607.5, "epoch": 1.1507760532150777, "grad_norm": 0.34481558203697205, "kl": 0.13396817445755005, "learning_rate": 4.39717124602098e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4152 }, { "completion_length": 727.75, "epoch": 1.1510532150776054, "grad_norm": 0.32936808466911316, "kl": 0.09905340522527695, "learning_rate": 4.3968861078174096e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4153 }, { "completion_length": 651.5, "epoch": 1.151330376940133, "grad_norm": 0.0, "kl": 0.11573386937379837, "learning_rate": 4.396600911444232e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4154 }, { "completion_length": 570.0, "epoch": 1.1516075388026608, "grad_norm": 0.0, "kl": 0.16562193632125854, "learning_rate": 4.396315656910196e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4155 }, { "completion_length": 596.25, "epoch": 1.1518847006651884, "grad_norm": 0.0, "kl": 0.10905857384204865, "learning_rate": 4.396030344224046e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4156 }, { "completion_length": 598.25, "epoch": 1.1521618625277161, "grad_norm": 0.31836673617362976, "kl": 0.11725206673145294, "learning_rate": 4.395744973394532e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4157 }, { "completion_length": 644.75, "epoch": 1.1524390243902438, "grad_norm": 0.29586178064346313, "kl": 0.10330996662378311, "learning_rate": 4.395459544430407e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4158 }, { "completion_length": 655.5, "epoch": 1.1527161862527717, "grad_norm": 0.0, "kl": 0.11100944876670837, "learning_rate": 4.395174057340423e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4159 }, { "completion_length": 692.75, "epoch": 1.1529933481152994, "grad_norm": 0.5109776258468628, "kl": 0.09641733020544052, "learning_rate": 4.394888512133333e-06, "loss": 0.0, "reward": 5.71875, "reward_std": 0.0625, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.71875, "step": 4160 }, { "completion_length": 618.75, "epoch": 1.153270509977827, "grad_norm": 0.0, "kl": 0.11062923073768616, "learning_rate": 4.394602908817896e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4161 }, { "completion_length": 598.0, "epoch": 1.1535476718403548, "grad_norm": 0.0, "kl": 0.094207264482975, "learning_rate": 4.394317247402868e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4162 }, { "completion_length": 554.5, "epoch": 1.1538248337028825, "grad_norm": 0.0, "kl": 0.12155413627624512, "learning_rate": 4.394031527897012e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4163 }, { "completion_length": 603.5, "epoch": 1.1541019955654102, "grad_norm": 0.0, "kl": 0.10336371511220932, "learning_rate": 4.393745750309087e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4164 }, { "completion_length": 601.75, "epoch": 1.1543791574279378, "grad_norm": 0.0, "kl": 0.10371702164411545, "learning_rate": 4.393459914647857e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4165 }, { "completion_length": 533.25, "epoch": 1.1546563192904657, "grad_norm": 0.0, "kl": 0.14055678248405457, "learning_rate": 4.3931740209220885e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4166 }, { "completion_length": 652.75, "epoch": 1.1549334811529934, "grad_norm": 0.0, "kl": 0.10973228514194489, "learning_rate": 4.392888069140549e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4167 }, { "completion_length": 910.75, "epoch": 1.1552106430155211, "grad_norm": 0.3922610878944397, "kl": 0.08946827799081802, "learning_rate": 4.392602059312005e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4168 }, { "completion_length": 589.0, "epoch": 1.1554878048780488, "grad_norm": 0.0, "kl": 0.103705994784832, "learning_rate": 4.39231599144523e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4169 }, { "completion_length": 561.25, "epoch": 1.1557649667405765, "grad_norm": 0.0, "kl": 0.10593605041503906, "learning_rate": 4.392029865548995e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4170 }, { "completion_length": 544.5, "epoch": 1.1560421286031042, "grad_norm": 0.0, "kl": 0.15741725265979767, "learning_rate": 4.391743681632075e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4171 }, { "completion_length": 601.25, "epoch": 1.1563192904656319, "grad_norm": 0.3933873176574707, "kl": 0.2035210132598877, "learning_rate": 4.391457439703245e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4172 }, { "completion_length": 601.5, "epoch": 1.1565964523281596, "grad_norm": 0.3381573259830475, "kl": 0.10327735543251038, "learning_rate": 4.3911711397712845e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4173 }, { "completion_length": 601.0, "epoch": 1.1568736141906872, "grad_norm": 0.0, "kl": 0.09396539628505707, "learning_rate": 4.390884781844972e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4174 }, { "completion_length": 516.0, "epoch": 1.1571507760532151, "grad_norm": 0.35300183296203613, "kl": 0.12193988263607025, "learning_rate": 4.390598365933089e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4175 }, { "completion_length": 524.75, "epoch": 1.1574279379157428, "grad_norm": 0.0, "kl": 0.14816774427890778, "learning_rate": 4.390311892044418e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4176 }, { "completion_length": 537.75, "epoch": 1.1577050997782705, "grad_norm": 0.37528419494628906, "kl": 0.11736268550157547, "learning_rate": 4.390025360187746e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4177 }, { "completion_length": 689.5, "epoch": 1.1579822616407982, "grad_norm": 0.2834177315235138, "kl": 0.10054581612348557, "learning_rate": 4.389738770371858e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4178 }, { "completion_length": 507.75, "epoch": 1.158259423503326, "grad_norm": 0.0, "kl": 0.1224546954035759, "learning_rate": 4.389452122605543e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4179 }, { "completion_length": 545.0, "epoch": 1.1585365853658536, "grad_norm": 0.0, "kl": 0.11794005334377289, "learning_rate": 4.389165416897592e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4180 }, { "completion_length": 541.75, "epoch": 1.1588137472283813, "grad_norm": 0.40516847372055054, "kl": 0.10490527004003525, "learning_rate": 4.388878653256795e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4181 }, { "completion_length": 623.0, "epoch": 1.1590909090909092, "grad_norm": 0.27402693033218384, "kl": 0.11676210165023804, "learning_rate": 4.388591831691948e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4182 }, { "completion_length": 641.25, "epoch": 1.1593680709534369, "grad_norm": 0.0, "kl": 0.11547393351793289, "learning_rate": 4.388304952211846e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4183 }, { "completion_length": 484.25, "epoch": 1.1596452328159645, "grad_norm": 0.0, "kl": 0.13138048350811005, "learning_rate": 4.388018014825287e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4184 }, { "completion_length": 543.25, "epoch": 1.1599223946784922, "grad_norm": 0.0, "kl": 0.13621188700199127, "learning_rate": 4.387731019541068e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4185 }, { "completion_length": 563.75, "epoch": 1.16019955654102, "grad_norm": 0.32862743735313416, "kl": 0.12462703883647919, "learning_rate": 4.387443966367993e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4186 }, { "completion_length": 516.75, "epoch": 1.1604767184035476, "grad_norm": 0.0, "kl": 0.13287390768527985, "learning_rate": 4.387156855314862e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4187 }, { "completion_length": 431.0, "epoch": 1.1607538802660753, "grad_norm": 0.0, "kl": 0.1679127812385559, "learning_rate": 4.38686968639048e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4188 }, { "completion_length": 576.25, "epoch": 1.1610310421286032, "grad_norm": 0.0, "kl": 0.0990133136510849, "learning_rate": 4.386582459603655e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4189 }, { "completion_length": 547.25, "epoch": 1.1613082039911309, "grad_norm": 0.0, "kl": 0.12086954712867737, "learning_rate": 4.386295174963193e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4190 }, { "completion_length": 761.0, "epoch": 1.1615853658536586, "grad_norm": 0.0, "kl": 0.1117449402809143, "learning_rate": 4.386007832477906e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4191 }, { "completion_length": 535.0, "epoch": 1.1618625277161863, "grad_norm": 0.0, "kl": 0.12667414546012878, "learning_rate": 4.3857204321566035e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4192 }, { "completion_length": 662.75, "epoch": 1.162139689578714, "grad_norm": 0.0, "kl": 0.12209399044513702, "learning_rate": 4.3854329740081e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4193 }, { "completion_length": 669.25, "epoch": 1.1624168514412416, "grad_norm": 0.0, "kl": 0.11452281475067139, "learning_rate": 4.38514545804121e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4194 }, { "completion_length": 517.75, "epoch": 1.1626940133037693, "grad_norm": 0.0, "kl": 0.11250210553407669, "learning_rate": 4.384857884264751e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4195 }, { "completion_length": 653.0, "epoch": 1.1629711751662972, "grad_norm": 0.3102944493293762, "kl": 0.10072600841522217, "learning_rate": 4.384570252687542e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4196 }, { "completion_length": 523.75, "epoch": 1.163248337028825, "grad_norm": 0.0, "kl": 0.1258031725883484, "learning_rate": 4.384282563318403e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4197 }, { "completion_length": 655.75, "epoch": 1.1635254988913526, "grad_norm": 0.0, "kl": 0.10392246395349503, "learning_rate": 4.383994816166156e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4198 }, { "completion_length": 664.25, "epoch": 1.1638026607538803, "grad_norm": 0.3123323321342468, "kl": 0.09513253718614578, "learning_rate": 4.383707011239625e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4199 }, { "completion_length": 511.75, "epoch": 1.164079822616408, "grad_norm": 0.3381531238555908, "kl": 0.17092645168304443, "learning_rate": 4.383419148547636e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4200 }, { "completion_length": 630.25, "epoch": 1.1643569844789357, "grad_norm": 0.318454772233963, "kl": 0.1318209022283554, "learning_rate": 4.383131228099017e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4201 }, { "completion_length": 587.0, "epoch": 1.1646341463414633, "grad_norm": 0.0, "kl": 0.23533102869987488, "learning_rate": 4.382843249902596e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4202 }, { "completion_length": 554.0, "epoch": 1.1649113082039912, "grad_norm": 0.3623805642127991, "kl": 0.12552863359451294, "learning_rate": 4.382555213967206e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4203 }, { "completion_length": 563.0, "epoch": 1.165188470066519, "grad_norm": 0.0, "kl": 0.14068470895290375, "learning_rate": 4.382267120301679e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4204 }, { "completion_length": 649.75, "epoch": 1.1654656319290466, "grad_norm": 0.32880377769470215, "kl": 0.10892447084188461, "learning_rate": 4.381978968914849e-06, "loss": -0.0, "reward": 1.875, "reward_std": 0.25, "rewards/confident_score_func": 0.125, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4205 }, { "completion_length": 662.0, "epoch": 1.1657427937915743, "grad_norm": 0.0, "kl": 0.09786336869001389, "learning_rate": 4.381690759815552e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4206 }, { "completion_length": 577.75, "epoch": 1.166019955654102, "grad_norm": 0.0, "kl": 0.12332262843847275, "learning_rate": 4.381402493012627e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4207 }, { "completion_length": 561.0, "epoch": 1.1662971175166297, "grad_norm": 0.0, "kl": 0.13553354144096375, "learning_rate": 4.381114168514915e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4208 }, { "completion_length": 563.25, "epoch": 1.1665742793791574, "grad_norm": 0.0, "kl": 0.11029462516307831, "learning_rate": 4.380825786331257e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4209 }, { "completion_length": 566.75, "epoch": 1.166851441241685, "grad_norm": 0.41209644079208374, "kl": 0.15719883143901825, "learning_rate": 4.380537346470495e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4210 }, { "completion_length": 660.75, "epoch": 1.1671286031042127, "grad_norm": 0.0, "kl": 0.12755517661571503, "learning_rate": 4.3802488489414764e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4211 }, { "completion_length": 652.0, "epoch": 1.1674057649667406, "grad_norm": 0.0, "kl": 0.10574567317962646, "learning_rate": 4.3799602937530464e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4212 }, { "completion_length": 565.25, "epoch": 1.1676829268292683, "grad_norm": 0.0, "kl": 0.12192897498607635, "learning_rate": 4.379671680914055e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4213 }, { "completion_length": 679.5, "epoch": 1.167960088691796, "grad_norm": 0.0, "kl": 0.10571157932281494, "learning_rate": 4.379383010433352e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4214 }, { "completion_length": 469.5, "epoch": 1.1682372505543237, "grad_norm": 0.37513625621795654, "kl": 0.13702835142612457, "learning_rate": 4.379094282319791e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4215 }, { "completion_length": 546.75, "epoch": 1.1685144124168514, "grad_norm": 0.0, "kl": 0.11995381861925125, "learning_rate": 4.378805496582225e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4216 }, { "completion_length": 570.0, "epoch": 1.168791574279379, "grad_norm": 0.0, "kl": 0.14459985494613647, "learning_rate": 4.378516653229509e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4217 }, { "completion_length": 598.0, "epoch": 1.1690687361419068, "grad_norm": 0.0, "kl": 0.12774381041526794, "learning_rate": 4.378227752270503e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4218 }, { "completion_length": 557.75, "epoch": 1.1693458980044347, "grad_norm": 0.33305659890174866, "kl": 0.10568790882825851, "learning_rate": 4.377938793714064e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4219 }, { "completion_length": 551.0, "epoch": 1.1696230598669624, "grad_norm": 0.36596304178237915, "kl": 0.1946587860584259, "learning_rate": 4.377649777569055e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4220 }, { "completion_length": 578.0, "epoch": 1.16990022172949, "grad_norm": 0.3762837052345276, "kl": 0.17725388705730438, "learning_rate": 4.377360703844338e-06, "loss": 0.0, "reward": 1.5625, "reward_std": 0.375, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.6875, "step": 4221 }, { "completion_length": 521.5, "epoch": 1.1701773835920177, "grad_norm": 0.3808498680591583, "kl": 0.2793045938014984, "learning_rate": 4.377071572548778e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4222 }, { "completion_length": 507.0, "epoch": 1.1704545454545454, "grad_norm": 0.0, "kl": 0.13791467249393463, "learning_rate": 4.37678238369124e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4223 }, { "completion_length": 498.75, "epoch": 1.170731707317073, "grad_norm": 0.0, "kl": 0.1742064654827118, "learning_rate": 4.376493137280595e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4224 }, { "completion_length": 544.75, "epoch": 1.1710088691796008, "grad_norm": 0.3808997571468353, "kl": 0.1333262026309967, "learning_rate": 4.376203833325711e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4225 }, { "completion_length": 472.75, "epoch": 1.1712860310421287, "grad_norm": 0.0, "kl": 0.11787128448486328, "learning_rate": 4.37591447183546e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4226 }, { "completion_length": 543.25, "epoch": 1.1715631929046564, "grad_norm": 0.0, "kl": 0.1317693293094635, "learning_rate": 4.375625052818716e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4227 }, { "completion_length": 522.5, "epoch": 1.171840354767184, "grad_norm": 0.0, "kl": 0.13690587878227234, "learning_rate": 4.375335576284355e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4228 }, { "completion_length": 460.5, "epoch": 1.1721175166297118, "grad_norm": 0.0, "kl": 0.8505750298500061, "learning_rate": 4.375046042241252e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4229 }, { "completion_length": 620.25, "epoch": 1.1723946784922394, "grad_norm": 0.3321506977081299, "kl": 0.12058330327272415, "learning_rate": 4.374756450698287e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4230 }, { "completion_length": 500.0, "epoch": 1.1726718403547671, "grad_norm": 0.0, "kl": 0.11384192854166031, "learning_rate": 4.37446680166434e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4231 }, { "completion_length": 876.0, "epoch": 1.1729490022172948, "grad_norm": 0.212201789021492, "kl": 0.10337945073843002, "learning_rate": 4.374177095148295e-06, "loss": 0.0, "reward": 4.71875, "reward_std": 2.0625, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.71875, "step": 4232 }, { "completion_length": 591.0, "epoch": 1.1732261640798227, "grad_norm": 0.36266377568244934, "kl": 0.13010810315608978, "learning_rate": 4.3738873311590335e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4233 }, { "completion_length": 587.0, "epoch": 1.1735033259423504, "grad_norm": 0.0, "kl": 0.09707208722829819, "learning_rate": 4.373597509705444e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4234 }, { "completion_length": 555.75, "epoch": 1.173780487804878, "grad_norm": 0.0, "kl": 0.15613827109336853, "learning_rate": 4.373307630796412e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4235 }, { "completion_length": 617.25, "epoch": 1.1740576496674058, "grad_norm": 0.0, "kl": 0.10556985437870026, "learning_rate": 4.373017694440828e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4236 }, { "completion_length": 468.75, "epoch": 1.1743348115299335, "grad_norm": 0.3605201542377472, "kl": 0.11799760162830353, "learning_rate": 4.372727700647583e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4237 }, { "completion_length": 589.5, "epoch": 1.1746119733924612, "grad_norm": 0.0, "kl": 0.11442489176988602, "learning_rate": 4.372437649425569e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4238 }, { "completion_length": 578.75, "epoch": 1.1748891352549888, "grad_norm": 0.0, "kl": 0.12396178394556046, "learning_rate": 4.372147540783683e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4239 }, { "completion_length": 578.0, "epoch": 1.1751662971175167, "grad_norm": 0.0, "kl": 0.12307630479335785, "learning_rate": 4.371857374730818e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4240 }, { "completion_length": 603.25, "epoch": 1.1754434589800444, "grad_norm": 0.0, "kl": 0.13784655928611755, "learning_rate": 4.371567151275875e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4241 }, { "completion_length": 675.25, "epoch": 1.1757206208425721, "grad_norm": 0.0, "kl": 0.12332746386528015, "learning_rate": 4.3712768704277535e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4242 }, { "completion_length": 584.75, "epoch": 1.1759977827050998, "grad_norm": 0.0, "kl": 0.12773416936397552, "learning_rate": 4.370986532195354e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4243 }, { "completion_length": 592.0, "epoch": 1.1762749445676275, "grad_norm": 0.0, "kl": 0.11621320992708206, "learning_rate": 4.370696136587581e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4244 }, { "completion_length": 594.75, "epoch": 1.1765521064301552, "grad_norm": 0.0, "kl": 0.14733922481536865, "learning_rate": 4.370405683613339e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4245 }, { "completion_length": 664.5, "epoch": 1.1768292682926829, "grad_norm": 0.0, "kl": 0.10947874933481216, "learning_rate": 4.370115173281536e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4246 }, { "completion_length": 576.25, "epoch": 1.1771064301552105, "grad_norm": 0.0, "kl": 0.13336296379566193, "learning_rate": 4.3698246056010794e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4247 }, { "completion_length": 647.75, "epoch": 1.1773835920177385, "grad_norm": 0.3853812515735626, "kl": 0.12795256078243256, "learning_rate": 4.36953398058088e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4248 }, { "completion_length": 644.25, "epoch": 1.1776607538802661, "grad_norm": 0.0, "kl": 0.12270086258649826, "learning_rate": 4.3692432982298515e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4249 }, { "completion_length": 603.0, "epoch": 1.1779379157427938, "grad_norm": 0.0, "kl": 0.10151936858892441, "learning_rate": 4.368952558556907e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4250 }, { "completion_length": 621.5, "epoch": 1.1782150776053215, "grad_norm": 0.0, "kl": 0.1754736304283142, "learning_rate": 4.368661761570961e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4251 }, { "completion_length": 591.75, "epoch": 1.1784922394678492, "grad_norm": 0.36244621872901917, "kl": 0.11197128146886826, "learning_rate": 4.368370907280933e-06, "loss": -0.0, "reward": 3.59375, "reward_std": 2.144214630126953, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 4252 }, { "completion_length": 489.75, "epoch": 1.1787694013303769, "grad_norm": 0.0, "kl": 0.12317078560590744, "learning_rate": 4.368079995695742e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4253 }, { "completion_length": 621.0, "epoch": 1.1790465631929046, "grad_norm": 0.0, "kl": 0.11255237460136414, "learning_rate": 4.3677890268243085e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4254 }, { "completion_length": 642.5, "epoch": 1.1793237250554323, "grad_norm": 0.0, "kl": 0.15068574249744415, "learning_rate": 4.367498000675555e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4255 }, { "completion_length": 700.25, "epoch": 1.1796008869179602, "grad_norm": 0.0, "kl": 0.10874231904745102, "learning_rate": 4.367206917258407e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4256 }, { "completion_length": 540.25, "epoch": 1.1798780487804879, "grad_norm": 0.0, "kl": 0.14008982479572296, "learning_rate": 4.366915776581788e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4257 }, { "completion_length": 644.75, "epoch": 1.1801552106430155, "grad_norm": 0.0, "kl": 0.11713873594999313, "learning_rate": 4.36662457865463e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4258 }, { "completion_length": 544.75, "epoch": 1.1804323725055432, "grad_norm": 0.3855811357498169, "kl": 0.13032189011573792, "learning_rate": 4.366333323485862e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4259 }, { "completion_length": 554.25, "epoch": 1.180709534368071, "grad_norm": 0.0, "kl": 0.1267942190170288, "learning_rate": 4.366042011084414e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4260 }, { "completion_length": 545.0, "epoch": 1.1809866962305986, "grad_norm": 0.0, "kl": 0.12649044394493103, "learning_rate": 4.365750641459219e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4261 }, { "completion_length": 551.5, "epoch": 1.1812638580931263, "grad_norm": 0.0, "kl": 0.12682373821735382, "learning_rate": 4.3654592146192146e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4262 }, { "completion_length": 610.0, "epoch": 1.1815410199556542, "grad_norm": 0.0, "kl": 0.1157684326171875, "learning_rate": 4.365167730573335e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4263 }, { "completion_length": 641.5, "epoch": 1.1818181818181819, "grad_norm": 0.31960952281951904, "kl": 0.10649292916059494, "learning_rate": 4.364876189330521e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4264 }, { "completion_length": 622.25, "epoch": 1.1820953436807096, "grad_norm": 0.0, "kl": 0.10295964032411575, "learning_rate": 4.3645845908997115e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4265 }, { "completion_length": 542.5, "epoch": 1.1823725055432373, "grad_norm": 0.0, "kl": 0.139785498380661, "learning_rate": 4.364292935289849e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4266 }, { "completion_length": 663.0, "epoch": 1.182649667405765, "grad_norm": 0.0, "kl": 0.10843531042337418, "learning_rate": 4.364001222509877e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4267 }, { "completion_length": 528.0, "epoch": 1.1829268292682926, "grad_norm": 0.3413810431957245, "kl": 0.14322315156459808, "learning_rate": 4.363709452568743e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4268 }, { "completion_length": 690.25, "epoch": 1.1832039911308203, "grad_norm": 0.0, "kl": 0.1051672175526619, "learning_rate": 4.363417625475392e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4269 }, { "completion_length": 574.75, "epoch": 1.1834811529933482, "grad_norm": 0.33990898728370667, "kl": 0.10652685165405273, "learning_rate": 4.363125741238774e-06, "loss": 0.0, "reward": 4.8125, "reward_std": 1.875, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.6875, "step": 4270 }, { "completion_length": 544.5, "epoch": 1.183758314855876, "grad_norm": 0.4096218943595886, "kl": 0.10780074447393417, "learning_rate": 4.36283379986784e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4271 }, { "completion_length": 502.0, "epoch": 1.1840354767184036, "grad_norm": 0.0, "kl": 0.12795275449752808, "learning_rate": 4.362541801371542e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4272 }, { "completion_length": 580.25, "epoch": 1.1843126385809313, "grad_norm": 0.0, "kl": 0.12904873490333557, "learning_rate": 4.3622497457588355e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4273 }, { "completion_length": 677.75, "epoch": 1.184589800443459, "grad_norm": 0.304269015789032, "kl": 0.09971894323825836, "learning_rate": 4.361957633038676e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4274 }, { "completion_length": 591.0, "epoch": 1.1848669623059866, "grad_norm": 0.0, "kl": 0.18036742508411407, "learning_rate": 4.361665463220023e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4275 }, { "completion_length": 545.5, "epoch": 1.1851441241685143, "grad_norm": 0.3652897775173187, "kl": 0.3714126944541931, "learning_rate": 4.361373236311832e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4276 }, { "completion_length": 601.5, "epoch": 1.1854212860310422, "grad_norm": 0.33078882098197937, "kl": 0.10809363424777985, "learning_rate": 4.361080952323068e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4277 }, { "completion_length": 558.0, "epoch": 1.18569844789357, "grad_norm": 0.0, "kl": 0.28536051511764526, "learning_rate": 4.360788611262694e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4278 }, { "completion_length": 572.5, "epoch": 1.1859756097560976, "grad_norm": 0.0, "kl": 0.10858897864818573, "learning_rate": 4.360496213139673e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4279 }, { "completion_length": 574.5, "epoch": 1.1862527716186253, "grad_norm": 0.0, "kl": 0.13034695386886597, "learning_rate": 4.3602037579629724e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4280 }, { "completion_length": 578.5, "epoch": 1.186529933481153, "grad_norm": 0.3144787549972534, "kl": 0.11725015193223953, "learning_rate": 4.359911245741562e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4281 }, { "completion_length": 662.75, "epoch": 1.1868070953436807, "grad_norm": 0.296891450881958, "kl": 0.1318640261888504, "learning_rate": 4.35961867648441e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4282 }, { "completion_length": 521.75, "epoch": 1.1870842572062084, "grad_norm": 0.0, "kl": 0.11951422691345215, "learning_rate": 4.359326050200488e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4283 }, { "completion_length": 511.25, "epoch": 1.187361419068736, "grad_norm": 0.3900241255760193, "kl": 0.15174275636672974, "learning_rate": 4.359033366898772e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4284 }, { "completion_length": 545.5, "epoch": 1.187638580931264, "grad_norm": 0.0, "kl": 0.10416014492511749, "learning_rate": 4.358740626588235e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4285 }, { "completion_length": 623.0, "epoch": 1.1879157427937916, "grad_norm": 0.31798696517944336, "kl": 0.1072014644742012, "learning_rate": 4.358447829277856e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4286 }, { "completion_length": 568.25, "epoch": 1.1881929046563193, "grad_norm": 0.0, "kl": 0.14634151756763458, "learning_rate": 4.3581549749766125e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4287 }, { "completion_length": 477.25, "epoch": 1.188470066518847, "grad_norm": 0.4210139811038971, "kl": 0.14579418301582336, "learning_rate": 4.357862063693486e-06, "loss": 0.0, "reward": 1.71875, "reward_std": 0.0625, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.71875, "step": 4288 }, { "completion_length": 638.0, "epoch": 1.1887472283813747, "grad_norm": 0.3679218590259552, "kl": 0.11865728348493576, "learning_rate": 4.3575690954374584e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4289 }, { "completion_length": 543.5, "epoch": 1.1890243902439024, "grad_norm": 0.0, "kl": 0.1210751011967659, "learning_rate": 4.357276070217514e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4290 }, { "completion_length": 610.5, "epoch": 1.18930155210643, "grad_norm": 0.3924550414085388, "kl": 0.10106074810028076, "learning_rate": 4.3569829880426384e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4291 }, { "completion_length": 485.75, "epoch": 1.1895787139689578, "grad_norm": 0.0, "kl": 0.11640654504299164, "learning_rate": 4.35668984892182e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4292 }, { "completion_length": 494.5, "epoch": 1.1898558758314857, "grad_norm": 0.0, "kl": 0.14874611794948578, "learning_rate": 4.356396652864047e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4293 }, { "completion_length": 544.25, "epoch": 1.1901330376940134, "grad_norm": 0.0, "kl": 0.1179656982421875, "learning_rate": 4.356103399878311e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4294 }, { "completion_length": 507.5, "epoch": 1.190410199556541, "grad_norm": 0.0, "kl": 0.11981639266014099, "learning_rate": 4.3558100899736054e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4295 }, { "completion_length": 482.5, "epoch": 1.1906873614190687, "grad_norm": 0.0, "kl": 0.12556293606758118, "learning_rate": 4.355516723158924e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4296 }, { "completion_length": 486.75, "epoch": 1.1909645232815964, "grad_norm": 0.0, "kl": 0.12398289889097214, "learning_rate": 4.3552232994432635e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4297 }, { "completion_length": 576.25, "epoch": 1.191241685144124, "grad_norm": 0.0, "kl": 0.4313543140888214, "learning_rate": 4.354929818835622e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4298 }, { "completion_length": 601.0, "epoch": 1.1915188470066518, "grad_norm": 0.3629177212715149, "kl": 0.1094207614660263, "learning_rate": 4.3546362813449995e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4299 }, { "completion_length": 658.25, "epoch": 1.1917960088691797, "grad_norm": 0.0, "kl": 0.12842649221420288, "learning_rate": 4.354342686980397e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4300 }, { "completion_length": 513.75, "epoch": 1.1920731707317074, "grad_norm": 0.0, "kl": 0.1269051432609558, "learning_rate": 4.354049035750818e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4301 }, { "completion_length": 586.5, "epoch": 1.192350332594235, "grad_norm": 0.29618024826049805, "kl": 0.13865597546100616, "learning_rate": 4.353755327665268e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4302 }, { "completion_length": 584.75, "epoch": 1.1926274944567627, "grad_norm": 0.0, "kl": 0.12999384105205536, "learning_rate": 4.353461562732754e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4303 }, { "completion_length": 568.0, "epoch": 1.1929046563192904, "grad_norm": 0.0, "kl": 0.13371196389198303, "learning_rate": 4.353167740962283e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4304 }, { "completion_length": 541.25, "epoch": 1.1931818181818181, "grad_norm": 0.3427077531814575, "kl": 0.14883752167224884, "learning_rate": 4.352873862362868e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4305 }, { "completion_length": 521.5, "epoch": 1.1934589800443458, "grad_norm": 0.3799077868461609, "kl": 0.12603744864463806, "learning_rate": 4.3525799269435175e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4306 }, { "completion_length": 520.75, "epoch": 1.1937361419068737, "grad_norm": 0.0, "kl": 0.09205064177513123, "learning_rate": 4.352285934713248e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4307 }, { "completion_length": 505.25, "epoch": 1.1940133037694014, "grad_norm": 0.0, "kl": 0.09396573156118393, "learning_rate": 4.351991885681075e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4308 }, { "completion_length": 578.5, "epoch": 1.194290465631929, "grad_norm": 0.0, "kl": 0.12403213977813721, "learning_rate": 4.351697779856013e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4309 }, { "completion_length": 498.25, "epoch": 1.1945676274944568, "grad_norm": 0.0, "kl": 0.12457063794136047, "learning_rate": 4.351403617247085e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4310 }, { "completion_length": 574.5, "epoch": 1.1948447893569845, "grad_norm": 0.0, "kl": 0.10207358747720718, "learning_rate": 4.351109397863309e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4311 }, { "completion_length": 640.25, "epoch": 1.1951219512195121, "grad_norm": 0.30025923252105713, "kl": 0.10912256687879562, "learning_rate": 4.350815121713708e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4312 }, { "completion_length": 550.75, "epoch": 1.1953991130820398, "grad_norm": 0.0, "kl": 0.1078895702958107, "learning_rate": 4.350520788807307e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4313 }, { "completion_length": 625.25, "epoch": 1.1956762749445677, "grad_norm": 0.30219826102256775, "kl": 0.1795376092195511, "learning_rate": 4.35022639915313e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4314 }, { "completion_length": 547.0, "epoch": 1.1959534368070954, "grad_norm": 0.367352157831192, "kl": 0.12217161804437637, "learning_rate": 4.3499319527602065e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4315 }, { "completion_length": 604.25, "epoch": 1.1962305986696231, "grad_norm": 0.0, "kl": 0.24046503007411957, "learning_rate": 4.349637449637566e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4316 }, { "completion_length": 584.0, "epoch": 1.1965077605321508, "grad_norm": 0.0, "kl": 0.16919711232185364, "learning_rate": 4.34934288979424e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4317 }, { "completion_length": 654.75, "epoch": 1.1967849223946785, "grad_norm": 0.0, "kl": 0.1491086781024933, "learning_rate": 4.34904827323926e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4318 }, { "completion_length": 569.75, "epoch": 1.1970620842572062, "grad_norm": 0.39788419008255005, "kl": 0.11439143121242523, "learning_rate": 4.348753599981661e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4319 }, { "completion_length": 587.75, "epoch": 1.1973392461197339, "grad_norm": 0.0, "kl": 0.1041402816772461, "learning_rate": 4.34845887003048e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4320 }, { "completion_length": 509.5, "epoch": 1.1976164079822615, "grad_norm": 0.0, "kl": 0.16356809437274933, "learning_rate": 4.348164083394755e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4321 }, { "completion_length": 446.5, "epoch": 1.1978935698447895, "grad_norm": 0.0, "kl": 0.15178482234477997, "learning_rate": 4.347869240083525e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4322 }, { "completion_length": 591.0, "epoch": 1.1981707317073171, "grad_norm": 0.0, "kl": 0.11202647536993027, "learning_rate": 4.347574340105833e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4323 }, { "completion_length": 503.5, "epoch": 1.1984478935698448, "grad_norm": 0.0, "kl": 0.14562687277793884, "learning_rate": 4.347279383470722e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4324 }, { "completion_length": 590.5, "epoch": 1.1987250554323725, "grad_norm": 0.0, "kl": 0.1364283710718155, "learning_rate": 4.346984370187236e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4325 }, { "completion_length": 551.25, "epoch": 1.1990022172949002, "grad_norm": 0.6464191675186157, "kl": 0.15771685540676117, "learning_rate": 4.3466893002644225e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4326 }, { "completion_length": 524.0, "epoch": 1.1992793791574279, "grad_norm": 0.0, "kl": 0.13471876084804535, "learning_rate": 4.346394173711331e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4327 }, { "completion_length": 625.25, "epoch": 1.1995565410199556, "grad_norm": 0.0, "kl": 0.1469103991985321, "learning_rate": 4.34609899053701e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4328 }, { "completion_length": 544.75, "epoch": 1.1998337028824833, "grad_norm": 0.0, "kl": 0.13172177970409393, "learning_rate": 4.345803750750514e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4329 }, { "completion_length": 527.75, "epoch": 1.2001108647450112, "grad_norm": 0.4067454934120178, "kl": 0.12760429084300995, "learning_rate": 4.345508454360894e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4330 }, { "completion_length": 562.25, "epoch": 1.2003880266075388, "grad_norm": 0.0, "kl": 0.11345820873975754, "learning_rate": 4.345213101377208e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4331 }, { "completion_length": 504.75, "epoch": 1.2006651884700665, "grad_norm": 0.4055366814136505, "kl": 0.1453641653060913, "learning_rate": 4.344917691808511e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4332 }, { "completion_length": 590.0, "epoch": 1.2009423503325942, "grad_norm": 0.0, "kl": 0.10719301551580429, "learning_rate": 4.344622225663864e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4333 }, { "completion_length": 480.5, "epoch": 1.201219512195122, "grad_norm": 0.0, "kl": 0.24700577557086945, "learning_rate": 4.3443267029523265e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4334 }, { "completion_length": 519.5, "epoch": 1.2014966740576496, "grad_norm": 0.0, "kl": 0.11409854143857956, "learning_rate": 4.34403112368296e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4335 }, { "completion_length": 589.25, "epoch": 1.2017738359201773, "grad_norm": 0.0, "kl": 0.15380777418613434, "learning_rate": 4.343735487864831e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4336 }, { "completion_length": 517.25, "epoch": 1.2020509977827052, "grad_norm": 0.0, "kl": 0.12114152312278748, "learning_rate": 4.3434397955070045e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4337 }, { "completion_length": 541.5, "epoch": 1.2023281596452329, "grad_norm": 0.40691232681274414, "kl": 0.1418658047914505, "learning_rate": 4.343144046618547e-06, "loss": 0.0, "reward": 4.84375, "reward_std": 1.8125, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 4338 }, { "completion_length": 643.5, "epoch": 1.2026053215077606, "grad_norm": 0.0, "kl": 0.11227758228778839, "learning_rate": 4.34284824120853e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4339 }, { "completion_length": 570.0, "epoch": 1.2028824833702882, "grad_norm": 0.3885706961154938, "kl": 0.11095278710126877, "learning_rate": 4.3425523792860234e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4340 }, { "completion_length": 602.5, "epoch": 1.203159645232816, "grad_norm": 0.4098140597343445, "kl": 0.11163926124572754, "learning_rate": 4.3422564608601e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4341 }, { "completion_length": 596.5, "epoch": 1.2034368070953436, "grad_norm": 0.0, "kl": 0.12178032845258713, "learning_rate": 4.3419604859398345e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4342 }, { "completion_length": 577.5, "epoch": 1.2037139689578713, "grad_norm": 0.0, "kl": 0.19976651668548584, "learning_rate": 4.341664454534303e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4343 }, { "completion_length": 605.75, "epoch": 1.2039911308203992, "grad_norm": 0.31542515754699707, "kl": 0.11389478296041489, "learning_rate": 4.341368366652584e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4344 }, { "completion_length": 504.75, "epoch": 1.204268292682927, "grad_norm": 0.4155137240886688, "kl": 0.12792561948299408, "learning_rate": 4.341072222303757e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4345 }, { "completion_length": 594.0, "epoch": 1.2045454545454546, "grad_norm": 0.0, "kl": 0.13858947157859802, "learning_rate": 4.340776021496904e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4346 }, { "completion_length": 589.25, "epoch": 1.2048226164079823, "grad_norm": 0.0, "kl": 0.13081508874893188, "learning_rate": 4.340479764241107e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4347 }, { "completion_length": 586.0, "epoch": 1.20509977827051, "grad_norm": 0.0, "kl": 0.10898079723119736, "learning_rate": 4.340183450545453e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4348 }, { "completion_length": 675.0, "epoch": 1.2053769401330376, "grad_norm": 0.0, "kl": 0.0886986181139946, "learning_rate": 4.339887080419027e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4349 }, { "completion_length": 593.5, "epoch": 1.2056541019955653, "grad_norm": 0.0, "kl": 0.10744606703519821, "learning_rate": 4.339590653870917e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4350 }, { "completion_length": 597.75, "epoch": 1.2059312638580932, "grad_norm": 0.0, "kl": 0.1336802840232849, "learning_rate": 4.339294170910216e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4351 }, { "completion_length": 571.0, "epoch": 1.206208425720621, "grad_norm": 0.0, "kl": 0.12164979428052902, "learning_rate": 4.3389976315460125e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4352 }, { "completion_length": 697.75, "epoch": 1.2064855875831486, "grad_norm": 0.0, "kl": 0.20551881194114685, "learning_rate": 4.338701035787403e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4353 }, { "completion_length": 556.75, "epoch": 1.2067627494456763, "grad_norm": 0.0, "kl": 0.12755559384822845, "learning_rate": 4.33840438364348e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4354 }, { "completion_length": 601.25, "epoch": 1.207039911308204, "grad_norm": 0.0, "kl": 0.11782500147819519, "learning_rate": 4.338107675123343e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4355 }, { "completion_length": 569.75, "epoch": 1.2073170731707317, "grad_norm": 0.3128695487976074, "kl": 0.10401472449302673, "learning_rate": 4.33781091023609e-06, "loss": -0.0, "reward": 2.8125, "reward_std": 1.9618761539459229, "rewards/confident_score_func": 0.25, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.6875, "step": 4356 }, { "completion_length": 517.0, "epoch": 1.2075942350332594, "grad_norm": 0.35158196091651917, "kl": 0.14223524928092957, "learning_rate": 4.337514088990822e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4357 }, { "completion_length": 629.5, "epoch": 1.2078713968957873, "grad_norm": 0.0, "kl": 0.11378850787878036, "learning_rate": 4.33721721139664e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4358 }, { "completion_length": 618.75, "epoch": 1.208148558758315, "grad_norm": 0.34100499749183655, "kl": 0.11805912107229233, "learning_rate": 4.336920277462649e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4359 }, { "completion_length": 573.5, "epoch": 1.2084257206208426, "grad_norm": 0.406573086977005, "kl": 0.12196256965398788, "learning_rate": 4.336623287197954e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4360 }, { "completion_length": 513.0, "epoch": 1.2087028824833703, "grad_norm": 0.0, "kl": 0.1316055804491043, "learning_rate": 4.3363262406116634e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4361 }, { "completion_length": 562.25, "epoch": 1.208980044345898, "grad_norm": 0.3644918203353882, "kl": 0.11504177004098892, "learning_rate": 4.3360291377128864e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4362 }, { "completion_length": 545.5, "epoch": 1.2092572062084257, "grad_norm": 0.0, "kl": 0.11505597084760666, "learning_rate": 4.3357319785107325e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4363 }, { "completion_length": 539.25, "epoch": 1.2095343680709534, "grad_norm": 0.0, "kl": 0.10958970338106155, "learning_rate": 4.335434763014316e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4364 }, { "completion_length": 582.75, "epoch": 1.209811529933481, "grad_norm": 0.0, "kl": 0.10257399082183838, "learning_rate": 4.33513749123275e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4365 }, { "completion_length": 586.25, "epoch": 1.2100886917960088, "grad_norm": 0.30364683270454407, "kl": 0.10998231917619705, "learning_rate": 4.334840163175152e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4366 }, { "completion_length": 569.0, "epoch": 1.2103658536585367, "grad_norm": 0.31040143966674805, "kl": 0.10691911727190018, "learning_rate": 4.334542778850638e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4367 }, { "completion_length": 577.75, "epoch": 1.2106430155210643, "grad_norm": 0.0, "kl": 0.11671479046344757, "learning_rate": 4.334245338268329e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4368 }, { "completion_length": 540.25, "epoch": 1.210920177383592, "grad_norm": 0.44218313694000244, "kl": 0.10876988619565964, "learning_rate": 4.333947841437345e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4369 }, { "completion_length": 476.75, "epoch": 1.2111973392461197, "grad_norm": 0.3548015058040619, "kl": 0.14136666059494019, "learning_rate": 4.33365028836681e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4370 }, { "completion_length": 592.75, "epoch": 1.2114745011086474, "grad_norm": 0.35561370849609375, "kl": 0.19484005868434906, "learning_rate": 4.333352679065849e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4371 }, { "completion_length": 443.25, "epoch": 1.211751662971175, "grad_norm": 0.0, "kl": 0.11166413128376007, "learning_rate": 4.333055013543588e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4372 }, { "completion_length": 611.5, "epoch": 1.2120288248337028, "grad_norm": 0.3037776052951813, "kl": 0.10884921997785568, "learning_rate": 4.332757291809154e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4373 }, { "completion_length": 590.25, "epoch": 1.2123059866962307, "grad_norm": 0.367396742105484, "kl": 0.14512823522090912, "learning_rate": 4.332459513871679e-06, "loss": -0.0, "reward": 5.71875, "reward_std": 0.0625, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.71875, "step": 4374 }, { "completion_length": 541.75, "epoch": 1.2125831485587584, "grad_norm": 0.38621148467063904, "kl": 0.09721064567565918, "learning_rate": 4.3321616797402935e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4375 }, { "completion_length": 750.5, "epoch": 1.212860310421286, "grad_norm": 0.30853256583213806, "kl": 0.10428164899349213, "learning_rate": 4.33186378942413e-06, "loss": 0.0, "reward": 5.5625, "reward_std": 0.375, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5625, "step": 4376 }, { "completion_length": 574.0, "epoch": 1.2131374722838137, "grad_norm": 0.0, "kl": 0.10436492413282394, "learning_rate": 4.331565842932325e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4377 }, { "completion_length": 550.25, "epoch": 1.2134146341463414, "grad_norm": 0.0, "kl": 0.130195751786232, "learning_rate": 4.331267840274015e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4378 }, { "completion_length": 573.75, "epoch": 1.2136917960088691, "grad_norm": 0.0, "kl": 0.13402223587036133, "learning_rate": 4.330969781458338e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4379 }, { "completion_length": 608.25, "epoch": 1.2139689578713968, "grad_norm": 0.0, "kl": 0.11012452095746994, "learning_rate": 4.3306716664944345e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4380 }, { "completion_length": 562.25, "epoch": 1.2142461197339247, "grad_norm": 0.0, "kl": 0.12049033492803574, "learning_rate": 4.330373495391446e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4381 }, { "completion_length": 562.5, "epoch": 1.2145232815964524, "grad_norm": 0.36594218015670776, "kl": 0.10951598733663559, "learning_rate": 4.330075268158517e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4382 }, { "completion_length": 558.0, "epoch": 1.21480044345898, "grad_norm": 0.0, "kl": 0.12161804735660553, "learning_rate": 4.329776984804792e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4383 }, { "completion_length": 647.0, "epoch": 1.2150776053215078, "grad_norm": 0.28053662180900574, "kl": 0.12341504544019699, "learning_rate": 4.329478645339419e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4384 }, { "completion_length": 442.0, "epoch": 1.2153547671840355, "grad_norm": 0.41325610876083374, "kl": 0.17223809659481049, "learning_rate": 4.329180249771546e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4385 }, { "completion_length": 590.0, "epoch": 1.2156319290465631, "grad_norm": 0.0, "kl": 0.12414345890283585, "learning_rate": 4.328881798110324e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4386 }, { "completion_length": 638.5, "epoch": 1.2159090909090908, "grad_norm": 0.0, "kl": 0.11452877521514893, "learning_rate": 4.328583290364906e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4387 }, { "completion_length": 566.5, "epoch": 1.2161862527716187, "grad_norm": 0.0, "kl": 0.13571560382843018, "learning_rate": 4.3282847265444436e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4388 }, { "completion_length": 588.25, "epoch": 1.2164634146341464, "grad_norm": 0.0, "kl": 0.13061049580574036, "learning_rate": 4.327986106658096e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4389 }, { "completion_length": 560.5, "epoch": 1.216740576496674, "grad_norm": 0.0, "kl": 0.13053591549396515, "learning_rate": 4.327687430715017e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4390 }, { "completion_length": 554.25, "epoch": 1.2170177383592018, "grad_norm": 0.37264084815979004, "kl": 0.16173206269741058, "learning_rate": 4.327388698724369e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4391 }, { "completion_length": 535.75, "epoch": 1.2172949002217295, "grad_norm": 0.0, "kl": 0.10840927809476852, "learning_rate": 4.32708991069531e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4392 }, { "completion_length": 592.75, "epoch": 1.2175720620842572, "grad_norm": 0.3374249041080475, "kl": 0.12228161841630936, "learning_rate": 4.3267910666370046e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4393 }, { "completion_length": 605.5, "epoch": 1.2178492239467849, "grad_norm": 0.3752693235874176, "kl": 0.13664782047271729, "learning_rate": 4.326492166558617e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4394 }, { "completion_length": 567.75, "epoch": 1.2181263858093128, "grad_norm": 0.33607587218284607, "kl": 0.10626862198114395, "learning_rate": 4.326193210469312e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4395 }, { "completion_length": 595.75, "epoch": 1.2184035476718404, "grad_norm": 0.36212286353111267, "kl": 0.11288202553987503, "learning_rate": 4.3258941983782585e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4396 }, { "completion_length": 585.5, "epoch": 1.2186807095343681, "grad_norm": 0.0, "kl": 0.11657356470823288, "learning_rate": 4.325595130294626e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4397 }, { "completion_length": 585.5, "epoch": 1.2189578713968958, "grad_norm": 0.3358595669269562, "kl": 0.13562558591365814, "learning_rate": 4.325296006227583e-06, "loss": -0.0, "reward": 5.71875, "reward_std": 0.0625, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.71875, "step": 4398 }, { "completion_length": 631.5, "epoch": 1.2192350332594235, "grad_norm": 0.0, "kl": 0.10806046426296234, "learning_rate": 4.324996826186306e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4399 }, { "completion_length": 560.75, "epoch": 1.2195121951219512, "grad_norm": 0.0, "kl": 0.13106371462345123, "learning_rate": 4.3246975901799685e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4400 }, { "completion_length": 551.5, "epoch": 1.2197893569844789, "grad_norm": 0.0, "kl": 0.1265946328639984, "learning_rate": 4.324398298217746e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4401 }, { "completion_length": 544.75, "epoch": 1.2200665188470066, "grad_norm": 0.0, "kl": 0.11574920266866684, "learning_rate": 4.324098950308817e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4402 }, { "completion_length": 576.5, "epoch": 1.2203436807095343, "grad_norm": 0.0, "kl": 0.1117011234164238, "learning_rate": 4.32379954646236e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4403 }, { "completion_length": 640.75, "epoch": 1.2206208425720622, "grad_norm": 0.0, "kl": 0.12433797121047974, "learning_rate": 4.32350008668756e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4404 }, { "completion_length": 922.0, "epoch": 1.2208980044345898, "grad_norm": 0.0, "kl": 0.09761221706867218, "learning_rate": 4.3232005709935965e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4405 }, { "completion_length": 650.25, "epoch": 1.2211751662971175, "grad_norm": 0.0, "kl": 0.10949193686246872, "learning_rate": 4.322900999389656e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4406 }, { "completion_length": 624.25, "epoch": 1.2214523281596452, "grad_norm": 0.3051224648952484, "kl": 0.13004349172115326, "learning_rate": 4.322601371884925e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4407 }, { "completion_length": 583.5, "epoch": 1.221729490022173, "grad_norm": 0.3523942530155182, "kl": 0.14494211971759796, "learning_rate": 4.322301688488592e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4408 }, { "completion_length": 960.0, "epoch": 1.2220066518847006, "grad_norm": 0.4067169725894928, "kl": 0.27750906348228455, "learning_rate": 4.322001949209846e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4409 }, { "completion_length": 559.5, "epoch": 1.2222838137472283, "grad_norm": 0.0, "kl": 0.6394997239112854, "learning_rate": 4.321702154057881e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4410 }, { "completion_length": 521.0, "epoch": 1.2225609756097562, "grad_norm": 0.0, "kl": 0.10834235697984695, "learning_rate": 4.321402303041888e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4411 }, { "completion_length": 662.75, "epoch": 1.2228381374722839, "grad_norm": 0.30657923221588135, "kl": 0.08675573021173477, "learning_rate": 4.321102396171063e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4412 }, { "completion_length": 634.25, "epoch": 1.2231152993348116, "grad_norm": 0.0, "kl": 0.11095912009477615, "learning_rate": 4.320802433454604e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4413 }, { "completion_length": 624.5, "epoch": 1.2233924611973392, "grad_norm": 0.35263097286224365, "kl": 0.13597755134105682, "learning_rate": 4.320502414901708e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4414 }, { "completion_length": 545.25, "epoch": 1.223669623059867, "grad_norm": 0.0, "kl": 0.16517196595668793, "learning_rate": 4.320202340521576e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4415 }, { "completion_length": 627.25, "epoch": 1.2239467849223946, "grad_norm": 0.0, "kl": 0.10535718500614166, "learning_rate": 4.31990221032341e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4416 }, { "completion_length": 626.5, "epoch": 1.2242239467849223, "grad_norm": 0.0, "kl": 0.14205801486968994, "learning_rate": 4.319602024316415e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4417 }, { "completion_length": 603.5, "epoch": 1.2245011086474502, "grad_norm": 0.0, "kl": 0.12468364089727402, "learning_rate": 4.319301782509794e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4418 }, { "completion_length": 969.5, "epoch": 1.224778270509978, "grad_norm": 0.3023553490638733, "kl": 0.08682762831449509, "learning_rate": 4.319001484912756e-06, "loss": 0.0, "reward": 4.71875, "reward_std": 1.980043649673462, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.71875, "step": 4419 }, { "completion_length": 567.5, "epoch": 1.2250554323725056, "grad_norm": 0.0, "kl": 0.1262287199497223, "learning_rate": 4.318701131534509e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4420 }, { "completion_length": 537.5, "epoch": 1.2253325942350333, "grad_norm": 0.3672245740890503, "kl": 0.12435274571180344, "learning_rate": 4.318400722384264e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4421 }, { "completion_length": 673.25, "epoch": 1.225609756097561, "grad_norm": 0.0, "kl": 0.11663033068180084, "learning_rate": 4.318100257471233e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4422 }, { "completion_length": 532.5, "epoch": 1.2258869179600886, "grad_norm": 0.0, "kl": 0.10863722860813141, "learning_rate": 4.317799736804631e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4423 }, { "completion_length": 556.5, "epoch": 1.2261640798226163, "grad_norm": 0.0, "kl": 0.1420101374387741, "learning_rate": 4.317499160393672e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4424 }, { "completion_length": 600.0, "epoch": 1.2264412416851442, "grad_norm": 0.0, "kl": 0.13590162992477417, "learning_rate": 4.317198528247575e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4425 }, { "completion_length": 531.75, "epoch": 1.226718403547672, "grad_norm": 0.0, "kl": 0.11788466572761536, "learning_rate": 4.316897840375558e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4426 }, { "completion_length": 531.0, "epoch": 1.2269955654101996, "grad_norm": 0.0, "kl": 0.11808838695287704, "learning_rate": 4.316597096786843e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4427 }, { "completion_length": 639.75, "epoch": 1.2272727272727273, "grad_norm": 0.0, "kl": 0.10741445422172546, "learning_rate": 4.3162962974906515e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4428 }, { "completion_length": 659.75, "epoch": 1.227549889135255, "grad_norm": 0.0, "kl": 0.10149899125099182, "learning_rate": 4.315995442496208e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4429 }, { "completion_length": 602.0, "epoch": 1.2278270509977827, "grad_norm": 0.3613360524177551, "kl": 0.13808292150497437, "learning_rate": 4.315694531812739e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4430 }, { "completion_length": 547.0, "epoch": 1.2281042128603104, "grad_norm": 0.0, "kl": 0.11395564675331116, "learning_rate": 4.315393565449472e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4431 }, { "completion_length": 662.0, "epoch": 1.2283813747228383, "grad_norm": 0.40763944387435913, "kl": 0.1370207518339157, "learning_rate": 4.315092543415636e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4432 }, { "completion_length": 528.75, "epoch": 1.228658536585366, "grad_norm": 0.0, "kl": 0.13745544850826263, "learning_rate": 4.314791465720461e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4433 }, { "completion_length": 549.5, "epoch": 1.2289356984478936, "grad_norm": 0.0, "kl": 0.13780106604099274, "learning_rate": 4.314490332373182e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4434 }, { "completion_length": 617.0, "epoch": 1.2292128603104213, "grad_norm": 0.0, "kl": 0.1390172243118286, "learning_rate": 4.314189143383034e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4435 }, { "completion_length": 500.5, "epoch": 1.229490022172949, "grad_norm": 0.0, "kl": 0.1464107632637024, "learning_rate": 4.31388789875925e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4436 }, { "completion_length": 589.5, "epoch": 1.2297671840354767, "grad_norm": 0.35034409165382385, "kl": 0.12725698947906494, "learning_rate": 4.31358659851107e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4437 }, { "completion_length": 578.75, "epoch": 1.2300443458980044, "grad_norm": 0.3584875762462616, "kl": 0.11915662884712219, "learning_rate": 4.313285242647735e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4438 }, { "completion_length": 525.5, "epoch": 1.230321507760532, "grad_norm": 0.0, "kl": 0.12050265818834305, "learning_rate": 4.312983831178483e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4439 }, { "completion_length": 582.25, "epoch": 1.2305986696230597, "grad_norm": 0.0, "kl": 0.1218058168888092, "learning_rate": 4.312682364112559e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4440 }, { "completion_length": 590.5, "epoch": 1.2308758314855877, "grad_norm": 0.39870914816856384, "kl": 0.1408717781305313, "learning_rate": 4.312380841459207e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4441 }, { "completion_length": 663.25, "epoch": 1.2311529933481153, "grad_norm": 0.3009794056415558, "kl": 0.11354565620422363, "learning_rate": 4.312079263227675e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4442 }, { "completion_length": 569.0, "epoch": 1.231430155210643, "grad_norm": 0.0, "kl": 0.12283731997013092, "learning_rate": 4.3117776294272095e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4443 }, { "completion_length": 621.25, "epoch": 1.2317073170731707, "grad_norm": 0.0, "kl": 0.12344485521316528, "learning_rate": 4.311475940067061e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4444 }, { "completion_length": 598.25, "epoch": 1.2319844789356984, "grad_norm": 0.0, "kl": 0.11570139229297638, "learning_rate": 4.311174195156481e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4445 }, { "completion_length": 484.0, "epoch": 1.232261640798226, "grad_norm": 0.0, "kl": 0.12345340102910995, "learning_rate": 4.310872394704722e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4446 }, { "completion_length": 619.25, "epoch": 1.2325388026607538, "grad_norm": 0.0, "kl": 0.21372751891613007, "learning_rate": 4.3105705387210405e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4447 }, { "completion_length": 633.75, "epoch": 1.2328159645232817, "grad_norm": 0.0, "kl": 0.1350010186433792, "learning_rate": 4.310268627214693e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4448 }, { "completion_length": 579.75, "epoch": 1.2330931263858094, "grad_norm": 0.0, "kl": 0.2034975290298462, "learning_rate": 4.309966660194936e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4449 }, { "completion_length": 553.25, "epoch": 1.233370288248337, "grad_norm": 0.3823844790458679, "kl": 0.15053677558898926, "learning_rate": 4.309664637671031e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4450 }, { "completion_length": 561.0, "epoch": 1.2336474501108647, "grad_norm": 0.33961576223373413, "kl": 0.16177235543727875, "learning_rate": 4.309362559652241e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4451 }, { "completion_length": 548.5, "epoch": 1.2339246119733924, "grad_norm": 0.0, "kl": 0.11906102299690247, "learning_rate": 4.309060426147826e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4452 }, { "completion_length": 657.25, "epoch": 1.2342017738359201, "grad_norm": 0.3033701479434967, "kl": 0.1429506242275238, "learning_rate": 4.3087582371670545e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4453 }, { "completion_length": 602.25, "epoch": 1.2344789356984478, "grad_norm": 0.0, "kl": 0.11932497471570969, "learning_rate": 4.308455992719192e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4454 }, { "completion_length": 590.25, "epoch": 1.2347560975609757, "grad_norm": 0.0, "kl": 0.11633021384477615, "learning_rate": 4.3081536928135055e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4455 }, { "completion_length": 553.75, "epoch": 1.2350332594235034, "grad_norm": 0.3411112427711487, "kl": 0.15283600986003876, "learning_rate": 4.307851337459269e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4456 }, { "completion_length": 556.25, "epoch": 1.235310421286031, "grad_norm": 0.3609060049057007, "kl": 0.16368897259235382, "learning_rate": 4.307548926665752e-06, "loss": 0.0, "reward": 1.625, "reward_std": 0.25, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4457 }, { "completion_length": 566.5, "epoch": 1.2355875831485588, "grad_norm": 0.0, "kl": 0.12153633683919907, "learning_rate": 4.307246460442229e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4458 }, { "completion_length": 600.0, "epoch": 1.2358647450110865, "grad_norm": 0.0, "kl": 0.12596441805362701, "learning_rate": 4.306943938797974e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4459 }, { "completion_length": 592.0, "epoch": 1.2361419068736141, "grad_norm": 0.35277479887008667, "kl": 0.11842641979455948, "learning_rate": 4.306641361742266e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4460 }, { "completion_length": 516.5, "epoch": 1.2364190687361418, "grad_norm": 0.44335657358169556, "kl": 0.18142451345920563, "learning_rate": 4.306338729284383e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4461 }, { "completion_length": 492.0, "epoch": 1.2366962305986697, "grad_norm": 0.38151609897613525, "kl": 0.17285850644111633, "learning_rate": 4.306036041433604e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4462 }, { "completion_length": 506.75, "epoch": 1.2369733924611974, "grad_norm": 0.0, "kl": 0.15372654795646667, "learning_rate": 4.305733298199214e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4463 }, { "completion_length": 460.75, "epoch": 1.237250554323725, "grad_norm": 0.0, "kl": 0.1362394243478775, "learning_rate": 4.305430499590494e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4464 }, { "completion_length": 661.5, "epoch": 1.2375277161862528, "grad_norm": 0.0, "kl": 0.11656063050031662, "learning_rate": 4.305127645616732e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4465 }, { "completion_length": 725.5, "epoch": 1.2378048780487805, "grad_norm": 0.2611009180545807, "kl": 0.2308487445116043, "learning_rate": 4.304824736287214e-06, "loss": -0.0, "reward": 1.875, "reward_std": 0.25, "rewards/confident_score_func": 0.125, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4466 }, { "completion_length": 548.5, "epoch": 1.2380820399113082, "grad_norm": 0.0, "kl": 0.4026051163673401, "learning_rate": 4.3045217716112295e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4467 }, { "completion_length": 504.0, "epoch": 1.2383592017738358, "grad_norm": 0.0, "kl": 0.17544180154800415, "learning_rate": 4.304218751598068e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4468 }, { "completion_length": 594.25, "epoch": 1.2386363636363638, "grad_norm": 0.0, "kl": 0.12302910536527634, "learning_rate": 4.303915676257024e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4469 }, { "completion_length": 478.5, "epoch": 1.2389135254988914, "grad_norm": 0.0, "kl": 0.14097580313682556, "learning_rate": 4.30361254559739e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4470 }, { "completion_length": 616.25, "epoch": 1.2391906873614191, "grad_norm": 0.35387474298477173, "kl": 0.11203436553478241, "learning_rate": 4.303309359628462e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4471 }, { "completion_length": 694.75, "epoch": 1.2394678492239468, "grad_norm": 0.0, "kl": 0.09946175664663315, "learning_rate": 4.303006118359536e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4472 }, { "completion_length": 510.0, "epoch": 1.2397450110864745, "grad_norm": 0.0, "kl": 0.13963516056537628, "learning_rate": 4.3027028217999145e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4473 }, { "completion_length": 610.75, "epoch": 1.2400221729490022, "grad_norm": 0.0, "kl": 0.11594761162996292, "learning_rate": 4.3023994699588965e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4474 }, { "completion_length": 578.25, "epoch": 1.2402993348115299, "grad_norm": 0.36631864309310913, "kl": 0.16192372143268585, "learning_rate": 4.302096062845784e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4475 }, { "completion_length": 551.0, "epoch": 1.2405764966740576, "grad_norm": 0.0, "kl": 0.14399783313274384, "learning_rate": 4.301792600469882e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4476 }, { "completion_length": 630.0, "epoch": 1.2408536585365852, "grad_norm": 0.33578553795814514, "kl": 0.10501854866743088, "learning_rate": 4.301489082840496e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4477 }, { "completion_length": 720.0, "epoch": 1.2411308203991132, "grad_norm": 0.0, "kl": 0.1085025742650032, "learning_rate": 4.301185509966933e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4478 }, { "completion_length": 624.25, "epoch": 1.2414079822616408, "grad_norm": 0.0, "kl": 0.18016107380390167, "learning_rate": 4.300881881858505e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4479 }, { "completion_length": 504.25, "epoch": 1.2416851441241685, "grad_norm": 0.4183712303638458, "kl": 0.13307178020477295, "learning_rate": 4.30057819852452e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4480 }, { "completion_length": 634.25, "epoch": 1.2419623059866962, "grad_norm": 0.38288384675979614, "kl": 0.24080561101436615, "learning_rate": 4.300274459974292e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4481 }, { "completion_length": 579.75, "epoch": 1.242239467849224, "grad_norm": 0.0, "kl": 1.0313111543655396, "learning_rate": 4.299970666217135e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4482 }, { "completion_length": 636.25, "epoch": 1.2425166297117516, "grad_norm": 0.4425272047519684, "kl": 0.11129248142242432, "learning_rate": 4.299666817262366e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4483 }, { "completion_length": 590.25, "epoch": 1.2427937915742793, "grad_norm": 0.0, "kl": 0.12636220455169678, "learning_rate": 4.2993629131193015e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4484 }, { "completion_length": 647.0, "epoch": 1.2430709534368072, "grad_norm": 0.32143357396125793, "kl": 0.12120648473501205, "learning_rate": 4.299058953797262e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4485 }, { "completion_length": 627.25, "epoch": 1.2433481152993349, "grad_norm": 0.0, "kl": 0.13850121200084686, "learning_rate": 4.298754939305568e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4486 }, { "completion_length": 612.25, "epoch": 1.2436252771618626, "grad_norm": 0.0, "kl": 0.12106995284557343, "learning_rate": 4.2984508696535435e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4487 }, { "completion_length": 618.5, "epoch": 1.2439024390243902, "grad_norm": 0.0, "kl": 0.14341983199119568, "learning_rate": 4.298146744850511e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4488 }, { "completion_length": 615.25, "epoch": 1.244179600886918, "grad_norm": 0.0, "kl": 0.1670326292514801, "learning_rate": 4.297842564905799e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4489 }, { "completion_length": 529.75, "epoch": 1.2444567627494456, "grad_norm": 0.0, "kl": 0.15366651117801666, "learning_rate": 4.297538329828733e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4490 }, { "completion_length": 550.0, "epoch": 1.2447339246119733, "grad_norm": 0.0, "kl": 0.18739832937717438, "learning_rate": 4.2972340396286454e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4491 }, { "completion_length": 614.75, "epoch": 1.2450110864745012, "grad_norm": 0.0, "kl": 0.12097359448671341, "learning_rate": 4.296929694314866e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4492 }, { "completion_length": 597.75, "epoch": 1.245288248337029, "grad_norm": 0.3259754180908203, "kl": 0.18856307864189148, "learning_rate": 4.296625293896727e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4493 }, { "completion_length": 575.0, "epoch": 1.2455654101995566, "grad_norm": 0.0, "kl": 0.1605861485004425, "learning_rate": 4.296320838383565e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4494 }, { "completion_length": 599.0, "epoch": 1.2458425720620843, "grad_norm": 0.36738321185112, "kl": 0.1113121509552002, "learning_rate": 4.296016327784715e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4495 }, { "completion_length": 524.25, "epoch": 1.246119733924612, "grad_norm": 0.0, "kl": 0.1419651359319687, "learning_rate": 4.295711762109515e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4496 }, { "completion_length": 530.0, "epoch": 1.2463968957871396, "grad_norm": 0.0, "kl": 0.13245928287506104, "learning_rate": 4.295407141367306e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4497 }, { "completion_length": 609.0, "epoch": 1.2466740576496673, "grad_norm": 0.0, "kl": 0.14049607515335083, "learning_rate": 4.295102465567428e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4498 }, { "completion_length": 596.25, "epoch": 1.2469512195121952, "grad_norm": 0.0, "kl": 0.12666723132133484, "learning_rate": 4.2947977347192246e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4499 }, { "completion_length": 620.0, "epoch": 1.247228381374723, "grad_norm": 0.31574803590774536, "kl": 0.10592196136713028, "learning_rate": 4.294492948832042e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4500 }, { "completion_length": 622.0, "epoch": 1.2475055432372506, "grad_norm": 0.3184148371219635, "kl": 0.10329493135213852, "learning_rate": 4.294188107915225e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4501 }, { "completion_length": 595.25, "epoch": 1.2477827050997783, "grad_norm": 0.0, "kl": 0.1206185594201088, "learning_rate": 4.293883211978122e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4502 }, { "completion_length": 536.25, "epoch": 1.248059866962306, "grad_norm": 0.0, "kl": 0.12507081031799316, "learning_rate": 4.293578261030084e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4503 }, { "completion_length": 564.5, "epoch": 1.2483370288248337, "grad_norm": 0.36074063181877136, "kl": 0.1382526159286499, "learning_rate": 4.2932732550804615e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4504 }, { "completion_length": 710.25, "epoch": 1.2486141906873613, "grad_norm": 0.0, "kl": 0.11910375207662582, "learning_rate": 4.2929681941386075e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4505 }, { "completion_length": 693.0, "epoch": 1.2488913525498893, "grad_norm": 0.0, "kl": 0.1260872483253479, "learning_rate": 4.292663078213878e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4506 }, { "completion_length": 553.25, "epoch": 1.249168514412417, "grad_norm": 0.0, "kl": 0.13866619765758514, "learning_rate": 4.2923579073156295e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4507 }, { "completion_length": 579.25, "epoch": 1.2494456762749446, "grad_norm": 0.0, "kl": 0.18108418583869934, "learning_rate": 4.292052681453219e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4508 }, { "completion_length": 610.25, "epoch": 1.2497228381374723, "grad_norm": 0.49244019389152527, "kl": 0.17073409259319305, "learning_rate": 4.291747400636009e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4509 }, { "completion_length": 502.25, "epoch": 1.25, "grad_norm": 0.0, "kl": 0.12724164128303528, "learning_rate": 4.291442064873359e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4510 }, { "completion_length": 594.0, "epoch": 1.2502771618625277, "grad_norm": 0.0, "kl": 0.126788929104805, "learning_rate": 4.291136674174633e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4511 }, { "completion_length": 617.75, "epoch": 1.2505543237250554, "grad_norm": 0.0, "kl": 0.15671305358409882, "learning_rate": 4.290831228549196e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4512 }, { "completion_length": 616.25, "epoch": 1.2508314855875833, "grad_norm": 0.0, "kl": 0.12084470689296722, "learning_rate": 4.2905257280064146e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4513 }, { "completion_length": 635.75, "epoch": 1.2511086474501107, "grad_norm": 0.3933984041213989, "kl": 0.11702939122915268, "learning_rate": 4.290220172555659e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4514 }, { "completion_length": 567.0, "epoch": 1.2513858093126387, "grad_norm": 0.0, "kl": 0.16978861391544342, "learning_rate": 4.289914562206297e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4515 }, { "completion_length": 613.25, "epoch": 1.2516629711751663, "grad_norm": 0.29907113313674927, "kl": 0.13390451669692993, "learning_rate": 4.289608896967701e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4516 }, { "completion_length": 638.75, "epoch": 1.251940133037694, "grad_norm": 0.0, "kl": 0.1501794010400772, "learning_rate": 4.289303176849244e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4517 }, { "completion_length": 567.0, "epoch": 1.2522172949002217, "grad_norm": 0.0, "kl": 0.16422516107559204, "learning_rate": 4.288997401860303e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4518 }, { "completion_length": 579.5, "epoch": 1.2524944567627494, "grad_norm": 0.3537048399448395, "kl": 0.12599021196365356, "learning_rate": 4.288691572010254e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4519 }, { "completion_length": 637.25, "epoch": 1.252771618625277, "grad_norm": 0.0, "kl": 0.21359676122665405, "learning_rate": 4.288385687308474e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4520 }, { "completion_length": 576.25, "epoch": 1.2530487804878048, "grad_norm": 0.0, "kl": 0.11719514429569244, "learning_rate": 4.2880797477643456e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4521 }, { "completion_length": 574.0, "epoch": 1.2533259423503327, "grad_norm": 0.0, "kl": 0.12263014167547226, "learning_rate": 4.287773753387249e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4522 }, { "completion_length": 576.0, "epoch": 1.2536031042128604, "grad_norm": 0.31337064504623413, "kl": 0.22925147414207458, "learning_rate": 4.287467704186569e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4523 }, { "completion_length": 632.0, "epoch": 1.253880266075388, "grad_norm": 0.3627072274684906, "kl": 0.11207762360572815, "learning_rate": 4.287161600171688e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4524 }, { "completion_length": 684.25, "epoch": 1.2541574279379157, "grad_norm": 0.0, "kl": 0.12387135624885559, "learning_rate": 4.286855441351998e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4525 }, { "completion_length": 604.25, "epoch": 1.2544345898004434, "grad_norm": 0.0, "kl": 0.11713162064552307, "learning_rate": 4.286549227736883e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4526 }, { "completion_length": 598.5, "epoch": 1.254711751662971, "grad_norm": 0.3617846965789795, "kl": 0.13674023747444153, "learning_rate": 4.286242959335736e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4527 }, { "completion_length": 562.75, "epoch": 1.2549889135254988, "grad_norm": 0.0, "kl": 0.15399488806724548, "learning_rate": 4.285936636157947e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4528 }, { "completion_length": 574.25, "epoch": 1.2552660753880267, "grad_norm": 0.0, "kl": 0.1572464555501938, "learning_rate": 4.2856302582129114e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4529 }, { "completion_length": 578.75, "epoch": 1.2555432372505544, "grad_norm": 0.3497602641582489, "kl": 0.1375879943370819, "learning_rate": 4.285323825510024e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4530 }, { "completion_length": 558.75, "epoch": 1.255820399113082, "grad_norm": 0.0, "kl": 0.2478504776954651, "learning_rate": 4.285017338058681e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4531 }, { "completion_length": 579.0, "epoch": 1.2560975609756098, "grad_norm": 0.0, "kl": 0.17649085819721222, "learning_rate": 4.284710795868282e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4532 }, { "completion_length": 610.75, "epoch": 1.2563747228381374, "grad_norm": 0.0, "kl": 0.11641564965248108, "learning_rate": 4.284404198948227e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4533 }, { "completion_length": 564.75, "epoch": 1.2566518847006651, "grad_norm": 0.37882673740386963, "kl": 3.156237840652466, "learning_rate": 4.284097547307919e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4534 }, { "completion_length": 712.25, "epoch": 1.2569290465631928, "grad_norm": 0.0, "kl": 0.11008501797914505, "learning_rate": 4.2837908409567595e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4535 }, { "completion_length": 690.75, "epoch": 1.2572062084257207, "grad_norm": 0.0, "kl": 0.11535485088825226, "learning_rate": 4.283484079904157e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4536 }, { "completion_length": 683.5, "epoch": 1.2574833702882484, "grad_norm": 0.0, "kl": 0.12223698943853378, "learning_rate": 4.2831772641595145e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4537 }, { "completion_length": 561.25, "epoch": 1.257760532150776, "grad_norm": 0.0, "kl": 0.12869298458099365, "learning_rate": 4.282870393732245e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4538 }, { "completion_length": 580.75, "epoch": 1.2580376940133038, "grad_norm": 0.5388805270195007, "kl": 0.1913289576768875, "learning_rate": 4.2825634686317565e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4539 }, { "completion_length": 583.25, "epoch": 1.2583148558758315, "grad_norm": 0.0, "kl": 0.11944164335727692, "learning_rate": 4.282256488867462e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4540 }, { "completion_length": 621.25, "epoch": 1.2585920177383592, "grad_norm": 0.0, "kl": 0.12614530324935913, "learning_rate": 4.281949454448775e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4541 }, { "completion_length": 563.0, "epoch": 1.2588691796008868, "grad_norm": 0.0, "kl": 0.14132440090179443, "learning_rate": 4.281642365385111e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4542 }, { "completion_length": 599.5, "epoch": 1.2591463414634148, "grad_norm": 0.0, "kl": 0.13397713005542755, "learning_rate": 4.281335221685886e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4543 }, { "completion_length": 659.0, "epoch": 1.2594235033259422, "grad_norm": 0.0, "kl": 0.13053388893604279, "learning_rate": 4.281028023360522e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4544 }, { "completion_length": 589.5, "epoch": 1.2597006651884701, "grad_norm": 0.0, "kl": 0.15125016868114471, "learning_rate": 4.280720770418435e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4545 }, { "completion_length": 594.0, "epoch": 1.2599778270509978, "grad_norm": 0.0, "kl": 0.12462747097015381, "learning_rate": 4.280413462869051e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4546 }, { "completion_length": 588.5, "epoch": 1.2602549889135255, "grad_norm": 0.0, "kl": 0.14592182636260986, "learning_rate": 4.280106100721793e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4547 }, { "completion_length": 583.75, "epoch": 1.2605321507760532, "grad_norm": 0.0, "kl": 0.1385865956544876, "learning_rate": 4.279798683986084e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4548 }, { "completion_length": 622.25, "epoch": 1.2608093126385809, "grad_norm": 0.33528125286102295, "kl": 0.150734081864357, "learning_rate": 4.279491212671355e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4549 }, { "completion_length": 589.75, "epoch": 1.2610864745011088, "grad_norm": 0.0, "kl": 0.1244419738650322, "learning_rate": 4.279183686787032e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4550 }, { "completion_length": 644.25, "epoch": 1.2613636363636362, "grad_norm": 0.0, "kl": 0.10464022308588028, "learning_rate": 4.278876106342547e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4551 }, { "completion_length": 530.25, "epoch": 1.2616407982261642, "grad_norm": 0.44034892320632935, "kl": 0.11248954385519028, "learning_rate": 4.278568471347332e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4552 }, { "completion_length": 668.5, "epoch": 1.2619179600886918, "grad_norm": 0.0, "kl": 0.12434026598930359, "learning_rate": 4.27826078181082e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4553 }, { "completion_length": 615.25, "epoch": 1.2621951219512195, "grad_norm": 0.3435327410697937, "kl": 0.13487987220287323, "learning_rate": 4.277953037742447e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4554 }, { "completion_length": 650.25, "epoch": 1.2624722838137472, "grad_norm": 0.0, "kl": 0.12190176546573639, "learning_rate": 4.277645239151651e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4555 }, { "completion_length": 622.75, "epoch": 1.262749445676275, "grad_norm": 0.0, "kl": 0.14203789830207825, "learning_rate": 4.2773373860478705e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4556 }, { "completion_length": 536.25, "epoch": 1.2630266075388026, "grad_norm": 0.0, "kl": 0.37424612045288086, "learning_rate": 4.2770294784405455e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4557 }, { "completion_length": 570.5, "epoch": 1.2633037694013303, "grad_norm": 0.388915091753006, "kl": 0.1651989221572876, "learning_rate": 4.276721516339119e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4558 }, { "completion_length": 631.25, "epoch": 1.2635809312638582, "grad_norm": 0.0, "kl": 0.10202694684267044, "learning_rate": 4.2764134997530345e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4559 }, { "completion_length": 492.0, "epoch": 1.2638580931263859, "grad_norm": 0.4265593886375427, "kl": 0.13389475643634796, "learning_rate": 4.276105428691737e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4560 }, { "completion_length": 678.75, "epoch": 1.2641352549889135, "grad_norm": 0.2863568365573883, "kl": 0.10555847734212875, "learning_rate": 4.275797303164675e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4561 }, { "completion_length": 777.0, "epoch": 1.2644124168514412, "grad_norm": 0.24517619609832764, "kl": 0.09658823907375336, "learning_rate": 4.275489123181297e-06, "loss": 0.0, "reward": 5.5625, "reward_std": 0.375, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5625, "step": 4562 }, { "completion_length": 568.0, "epoch": 1.264689578713969, "grad_norm": 0.525096595287323, "kl": 0.12050091475248337, "learning_rate": 4.2751808887510525e-06, "loss": 0.0, "reward": 4.09375, "reward_std": 3.3125, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.59375, "step": 4563 }, { "completion_length": 626.75, "epoch": 1.2649667405764966, "grad_norm": 0.30183058977127075, "kl": 0.16863210499286652, "learning_rate": 4.274872599883396e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4564 }, { "completion_length": 709.75, "epoch": 1.2652439024390243, "grad_norm": 0.0, "kl": 0.14150579273700714, "learning_rate": 4.2745642565877795e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4565 }, { "completion_length": 686.75, "epoch": 1.2655210643015522, "grad_norm": 0.0, "kl": 0.10554756969213486, "learning_rate": 4.274255858873659e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4566 }, { "completion_length": 633.25, "epoch": 1.2657982261640799, "grad_norm": 0.39572155475616455, "kl": 0.5722262263298035, "learning_rate": 4.2739474067504925e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4567 }, { "completion_length": 612.75, "epoch": 1.2660753880266076, "grad_norm": 0.0, "kl": 0.1426003873348236, "learning_rate": 4.2736389002277385e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4568 }, { "completion_length": 764.0, "epoch": 1.2663525498891353, "grad_norm": 0.0, "kl": 0.11623169481754303, "learning_rate": 4.273330339314858e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4569 }, { "completion_length": 702.75, "epoch": 1.266629711751663, "grad_norm": 0.0, "kl": 0.1283016800880432, "learning_rate": 4.273021724021311e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4570 }, { "completion_length": 639.5, "epoch": 1.2669068736141906, "grad_norm": 0.0, "kl": 0.21745654940605164, "learning_rate": 4.272713054356565e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4571 }, { "completion_length": 611.0, "epoch": 1.2671840354767183, "grad_norm": 0.0, "kl": 0.17739799618721008, "learning_rate": 4.272404330330084e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4572 }, { "completion_length": 596.25, "epoch": 1.2674611973392462, "grad_norm": 0.335440456867218, "kl": 0.13374954462051392, "learning_rate": 4.272095551951335e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4573 }, { "completion_length": 677.0, "epoch": 1.267738359201774, "grad_norm": 0.33585426211357117, "kl": 0.11100326478481293, "learning_rate": 4.271786719229787e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4574 }, { "completion_length": 693.75, "epoch": 1.2680155210643016, "grad_norm": 0.0, "kl": 0.1192009374499321, "learning_rate": 4.27147783217491e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4575 }, { "completion_length": 645.0, "epoch": 1.2682926829268293, "grad_norm": 0.41109341382980347, "kl": 0.14578628540039062, "learning_rate": 4.271168890796178e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4576 }, { "completion_length": 621.5, "epoch": 1.268569844789357, "grad_norm": 0.3186524510383606, "kl": 0.13245466351509094, "learning_rate": 4.2708598951030635e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4577 }, { "completion_length": 664.75, "epoch": 1.2688470066518847, "grad_norm": 0.0, "kl": 0.35741254687309265, "learning_rate": 4.270550845105043e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4578 }, { "completion_length": 621.0, "epoch": 1.2691241685144123, "grad_norm": 0.0, "kl": 0.191336527466774, "learning_rate": 4.2702417408115935e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4579 }, { "completion_length": 631.0, "epoch": 1.2694013303769403, "grad_norm": 0.3416120707988739, "kl": 0.2083868384361267, "learning_rate": 4.269932582232193e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4580 }, { "completion_length": 584.25, "epoch": 1.2696784922394677, "grad_norm": 0.3184886872768402, "kl": 0.12196730822324753, "learning_rate": 4.269623369376323e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4581 }, { "completion_length": 583.75, "epoch": 1.2699556541019956, "grad_norm": 0.0, "kl": 0.21827100217342377, "learning_rate": 4.269314102253467e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4582 }, { "completion_length": 545.5, "epoch": 1.2702328159645233, "grad_norm": 0.0, "kl": 0.12956297397613525, "learning_rate": 4.269004780873106e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4583 }, { "completion_length": 702.0, "epoch": 1.270509977827051, "grad_norm": 0.0, "kl": 0.12029915302991867, "learning_rate": 4.268695405244729e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4584 }, { "completion_length": 604.5, "epoch": 1.2707871396895787, "grad_norm": 0.0, "kl": 0.11497974395751953, "learning_rate": 4.26838597537782e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4585 }, { "completion_length": 651.5, "epoch": 1.2710643015521064, "grad_norm": 0.0, "kl": 0.13045960664749146, "learning_rate": 4.26807649128187e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4586 }, { "completion_length": 609.25, "epoch": 1.2713414634146343, "grad_norm": 0.0, "kl": 0.14199946820735931, "learning_rate": 4.267766952966369e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4587 }, { "completion_length": 732.25, "epoch": 1.2716186252771617, "grad_norm": 0.0, "kl": 0.11126133054494858, "learning_rate": 4.26745736044081e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4588 }, { "completion_length": 604.5, "epoch": 1.2718957871396896, "grad_norm": 0.3634030818939209, "kl": 0.22048094868659973, "learning_rate": 4.267147713714686e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4589 }, { "completion_length": 516.0, "epoch": 1.2721729490022173, "grad_norm": 0.0, "kl": 0.11985735595226288, "learning_rate": 4.2668380127974915e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4590 }, { "completion_length": 636.25, "epoch": 1.272450110864745, "grad_norm": 0.3579758107662201, "kl": 0.12219257652759552, "learning_rate": 4.266528257698725e-06, "loss": -0.0, "reward": 5.71875, "reward_std": 0.0625, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.71875, "step": 4591 }, { "completion_length": 611.0, "epoch": 1.2727272727272727, "grad_norm": 0.0, "kl": 0.1558423936367035, "learning_rate": 4.266218448427887e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4592 }, { "completion_length": 649.0, "epoch": 1.2730044345898004, "grad_norm": 0.3727573752403259, "kl": 0.17708300054073334, "learning_rate": 4.265908584994476e-06, "loss": -0.0, "reward": 2.09375, "reward_std": 2.7336158752441406, "rewards/confident_score_func": 0.25, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.59375, "step": 4593 }, { "completion_length": 599.75, "epoch": 1.273281596452328, "grad_norm": 0.0, "kl": 0.13478891551494598, "learning_rate": 4.265598667407994e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4594 }, { "completion_length": 729.0, "epoch": 1.2735587583148558, "grad_norm": 0.3890216648578644, "kl": 0.4212542474269867, "learning_rate": 4.265288695677946e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4595 }, { "completion_length": 671.75, "epoch": 1.2738359201773837, "grad_norm": 0.0, "kl": 0.13126078248023987, "learning_rate": 4.264978669813837e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4596 }, { "completion_length": 564.0, "epoch": 1.2741130820399114, "grad_norm": 0.0, "kl": 0.23340430855751038, "learning_rate": 4.264668589825175e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4597 }, { "completion_length": 674.75, "epoch": 1.274390243902439, "grad_norm": 0.32426711916923523, "kl": 0.1825837641954422, "learning_rate": 4.264358455721467e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4598 }, { "completion_length": 594.25, "epoch": 1.2746674057649667, "grad_norm": 0.0, "kl": 0.11640321463346481, "learning_rate": 4.264048267512225e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4599 }, { "completion_length": 561.5, "epoch": 1.2749445676274944, "grad_norm": 0.5532286763191223, "kl": 2.4179847240448, "learning_rate": 4.263738025206961e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4600 }, { "completion_length": 597.0, "epoch": 1.275221729490022, "grad_norm": 0.38201841711997986, "kl": 0.1253589540719986, "learning_rate": 4.263427728815189e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4601 }, { "completion_length": 636.75, "epoch": 1.2754988913525498, "grad_norm": 0.0, "kl": 0.141259104013443, "learning_rate": 4.263117378346425e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4602 }, { "completion_length": 661.0, "epoch": 1.2757760532150777, "grad_norm": 0.41820111870765686, "kl": 0.1123817190527916, "learning_rate": 4.262806973810184e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4603 }, { "completion_length": 635.75, "epoch": 1.2760532150776054, "grad_norm": 0.44699397683143616, "kl": 0.15091215074062347, "learning_rate": 4.262496515215987e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4604 }, { "completion_length": 640.0, "epoch": 1.276330376940133, "grad_norm": 0.0, "kl": 0.11956818401813507, "learning_rate": 4.262186002573354e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4605 }, { "completion_length": 509.0, "epoch": 1.2766075388026608, "grad_norm": 0.0, "kl": 0.13782501220703125, "learning_rate": 4.261875435891806e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4606 }, { "completion_length": 627.25, "epoch": 1.2768847006651884, "grad_norm": 0.3216615617275238, "kl": 0.11081140488386154, "learning_rate": 4.261564815180868e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4607 }, { "completion_length": 647.5, "epoch": 1.2771618625277161, "grad_norm": 0.0, "kl": 1.0644334554672241, "learning_rate": 4.261254140450065e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4608 }, { "completion_length": 617.0, "epoch": 1.2774390243902438, "grad_norm": 0.42730480432510376, "kl": 0.1276162713766098, "learning_rate": 4.2609434117089236e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4609 }, { "completion_length": 580.75, "epoch": 1.2777161862527717, "grad_norm": 0.0, "kl": 0.14751793444156647, "learning_rate": 4.260632628966974e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4610 }, { "completion_length": 637.5, "epoch": 1.2779933481152994, "grad_norm": 0.3675075173377991, "kl": 0.11436954885721207, "learning_rate": 4.260321792233745e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4611 }, { "completion_length": 600.25, "epoch": 1.278270509977827, "grad_norm": 0.3932536840438843, "kl": 0.21522049605846405, "learning_rate": 4.26001090151877e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4612 }, { "completion_length": 555.5, "epoch": 1.2785476718403548, "grad_norm": 0.0, "kl": 0.18194788694381714, "learning_rate": 4.259699956831582e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4613 }, { "completion_length": 605.25, "epoch": 1.2788248337028825, "grad_norm": 0.3246113359928131, "kl": 0.11488191038370132, "learning_rate": 4.259388958181716e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4614 }, { "completion_length": 616.0, "epoch": 1.2791019955654102, "grad_norm": 0.0, "kl": 0.13108296692371368, "learning_rate": 4.259077905578709e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4615 }, { "completion_length": 548.0, "epoch": 1.2793791574279378, "grad_norm": 0.0, "kl": 0.6775055527687073, "learning_rate": 4.258766799032101e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4616 }, { "completion_length": 609.25, "epoch": 1.2796563192904657, "grad_norm": 0.0, "kl": 0.11055517196655273, "learning_rate": 4.258455638551432e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4617 }, { "completion_length": 535.5, "epoch": 1.2799334811529932, "grad_norm": 0.0, "kl": 0.15365684032440186, "learning_rate": 4.258144424146243e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4618 }, { "completion_length": 595.5, "epoch": 1.2802106430155211, "grad_norm": 0.3267843723297119, "kl": 0.14093747735023499, "learning_rate": 4.2578331558260784e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4619 }, { "completion_length": 624.25, "epoch": 1.2804878048780488, "grad_norm": 0.0, "kl": 0.12748046219348907, "learning_rate": 4.257521833600484e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4620 }, { "completion_length": 635.5, "epoch": 1.2807649667405765, "grad_norm": 0.0, "kl": 0.1400149166584015, "learning_rate": 4.257210457479005e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4621 }, { "completion_length": 713.25, "epoch": 1.2810421286031042, "grad_norm": 0.0, "kl": 0.10479225963354111, "learning_rate": 4.2568990274711915e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4622 }, { "completion_length": 635.25, "epoch": 1.2813192904656319, "grad_norm": 0.32742592692375183, "kl": 0.31337451934814453, "learning_rate": 4.2565875435865935e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4623 }, { "completion_length": 649.0, "epoch": 1.2815964523281598, "grad_norm": 0.3644365966320038, "kl": 0.17104510962963104, "learning_rate": 4.256276005834762e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4624 }, { "completion_length": 676.0, "epoch": 1.2818736141906872, "grad_norm": 0.0, "kl": 0.15909062325954437, "learning_rate": 4.255964414225252e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4625 }, { "completion_length": 651.25, "epoch": 1.2821507760532151, "grad_norm": 0.0, "kl": 0.11515356600284576, "learning_rate": 4.255652768767619e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4626 }, { "completion_length": 620.75, "epoch": 1.2824279379157428, "grad_norm": 0.35000723600387573, "kl": 0.1278543472290039, "learning_rate": 4.255341069471417e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4627 }, { "completion_length": 645.75, "epoch": 1.2827050997782705, "grad_norm": 0.32650303840637207, "kl": 0.36399003863334656, "learning_rate": 4.255029316346208e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4628 }, { "completion_length": 611.25, "epoch": 1.2829822616407982, "grad_norm": 0.0, "kl": 0.1783749908208847, "learning_rate": 4.2547175094015495e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4629 }, { "completion_length": 678.75, "epoch": 1.283259423503326, "grad_norm": 0.5728721618652344, "kl": 0.18212023377418518, "learning_rate": 4.254405648647004e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4630 }, { "completion_length": 595.5, "epoch": 1.2835365853658536, "grad_norm": 0.38979974389076233, "kl": 0.2562377154827118, "learning_rate": 4.254093734092137e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4631 }, { "completion_length": 640.25, "epoch": 1.2838137472283813, "grad_norm": 0.0, "kl": 0.13232719898223877, "learning_rate": 4.253781765746511e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4632 }, { "completion_length": 591.0, "epoch": 1.2840909090909092, "grad_norm": 0.0, "kl": 0.12515637278556824, "learning_rate": 4.2534697436196944e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4633 }, { "completion_length": 552.5, "epoch": 1.2843680709534369, "grad_norm": 0.0, "kl": 0.18509528040885925, "learning_rate": 4.2531576677212545e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4634 }, { "completion_length": 774.0, "epoch": 1.2846452328159645, "grad_norm": 0.3649677336215973, "kl": 0.11102068424224854, "learning_rate": 4.252845538060763e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4635 }, { "completion_length": 645.75, "epoch": 1.2849223946784922, "grad_norm": 0.0, "kl": 0.16444425284862518, "learning_rate": 4.252533354647789e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4636 }, { "completion_length": 620.25, "epoch": 1.28519955654102, "grad_norm": 0.0, "kl": 0.15417274832725525, "learning_rate": 4.252221117491907e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4637 }, { "completion_length": 547.5, "epoch": 1.2854767184035476, "grad_norm": 0.0, "kl": 0.13927529752254486, "learning_rate": 4.251908826602694e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4638 }, { "completion_length": 812.25, "epoch": 1.2857538802660753, "grad_norm": 0.0, "kl": 0.10007453709840775, "learning_rate": 4.251596481989724e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4639 }, { "completion_length": 515.25, "epoch": 1.2860310421286032, "grad_norm": 0.3551638126373291, "kl": 0.16847893595695496, "learning_rate": 4.2512840836625765e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4640 }, { "completion_length": 636.75, "epoch": 1.2863082039911309, "grad_norm": 0.3199405372142792, "kl": 0.11771412193775177, "learning_rate": 4.250971631630832e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4641 }, { "completion_length": 790.75, "epoch": 1.2865853658536586, "grad_norm": 0.0, "kl": 0.1316555291414261, "learning_rate": 4.250659125904071e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4642 }, { "completion_length": 593.75, "epoch": 1.2868625277161863, "grad_norm": 0.0, "kl": 0.13249753415584564, "learning_rate": 4.250346566491877e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4643 }, { "completion_length": 634.5, "epoch": 1.287139689578714, "grad_norm": 0.0, "kl": 0.17384451627731323, "learning_rate": 4.250033953403835e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4644 }, { "completion_length": 469.75, "epoch": 1.2874168514412416, "grad_norm": 0.0, "kl": 0.18221846222877502, "learning_rate": 4.249721286649532e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4645 }, { "completion_length": 615.75, "epoch": 1.2876940133037693, "grad_norm": 0.0, "kl": 0.13132911920547485, "learning_rate": 4.249408566238555e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4646 }, { "completion_length": 574.25, "epoch": 1.2879711751662972, "grad_norm": 0.0, "kl": 0.10702449083328247, "learning_rate": 4.249095792180496e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4647 }, { "completion_length": 647.75, "epoch": 1.288248337028825, "grad_norm": 0.0, "kl": 0.13282272219657898, "learning_rate": 4.2487829644849445e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4648 }, { "completion_length": 567.25, "epoch": 1.2885254988913526, "grad_norm": 0.45927560329437256, "kl": 0.1769946962594986, "learning_rate": 4.248470083161494e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4649 }, { "completion_length": 631.0, "epoch": 1.2888026607538803, "grad_norm": 0.0, "kl": 0.1332208663225174, "learning_rate": 4.248157148219739e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4650 }, { "completion_length": 568.0, "epoch": 1.289079822616408, "grad_norm": 0.0, "kl": 0.23000586032867432, "learning_rate": 4.247844159669276e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4651 }, { "completion_length": 601.5, "epoch": 1.2893569844789357, "grad_norm": 0.0, "kl": 0.12082928419113159, "learning_rate": 4.247531117519705e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4652 }, { "completion_length": 600.75, "epoch": 1.2896341463414633, "grad_norm": 0.0, "kl": 0.11682623624801636, "learning_rate": 4.2472180217806245e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4653 }, { "completion_length": 549.5, "epoch": 1.2899113082039912, "grad_norm": 0.0, "kl": 0.135982483625412, "learning_rate": 4.2469048724616345e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4654 }, { "completion_length": 721.0, "epoch": 1.2901884700665187, "grad_norm": 0.0, "kl": 0.12480293214321136, "learning_rate": 4.246591669572339e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4655 }, { "completion_length": 536.25, "epoch": 1.2904656319290466, "grad_norm": 0.0, "kl": 0.14952485263347626, "learning_rate": 4.246278413122344e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4656 }, { "completion_length": 668.0, "epoch": 1.2907427937915743, "grad_norm": 0.0, "kl": 0.10578051954507828, "learning_rate": 4.245965103121253e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4657 }, { "completion_length": 510.5, "epoch": 1.291019955654102, "grad_norm": 0.0, "kl": 0.14763236045837402, "learning_rate": 4.2456517395786745e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4658 }, { "completion_length": 542.75, "epoch": 1.2912971175166297, "grad_norm": 0.3579481542110443, "kl": 0.16066837310791016, "learning_rate": 4.24533832250422e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4659 }, { "completion_length": 599.25, "epoch": 1.2915742793791574, "grad_norm": 0.0, "kl": 0.20578670501708984, "learning_rate": 4.2450248519075e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4660 }, { "completion_length": 519.25, "epoch": 1.2918514412416853, "grad_norm": 0.0, "kl": 0.13708806037902832, "learning_rate": 4.244711327798126e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4661 }, { "completion_length": 583.5, "epoch": 1.2921286031042127, "grad_norm": 0.0, "kl": 0.15596258640289307, "learning_rate": 4.244397750185714e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4662 }, { "completion_length": 643.25, "epoch": 1.2924057649667406, "grad_norm": 0.448910653591156, "kl": 0.12088140845298767, "learning_rate": 4.244084119079879e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4663 }, { "completion_length": 643.0, "epoch": 1.2926829268292683, "grad_norm": 0.0, "kl": 0.12366490066051483, "learning_rate": 4.24377043449024e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4664 }, { "completion_length": 591.75, "epoch": 1.292960088691796, "grad_norm": 0.0, "kl": 0.12428338825702667, "learning_rate": 4.243456696426415e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4665 }, { "completion_length": 508.0, "epoch": 1.2932372505543237, "grad_norm": 0.0, "kl": 0.15782807767391205, "learning_rate": 4.243142904898026e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4666 }, { "completion_length": 589.75, "epoch": 1.2935144124168514, "grad_norm": 0.3974420726299286, "kl": 0.13693873584270477, "learning_rate": 4.242829059914695e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4667 }, { "completion_length": 638.75, "epoch": 1.293791574279379, "grad_norm": 0.0, "kl": 0.13688775897026062, "learning_rate": 4.242515161486047e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4668 }, { "completion_length": 547.5, "epoch": 1.2940687361419068, "grad_norm": 0.0, "kl": 0.14480876922607422, "learning_rate": 4.242201209621708e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4669 }, { "completion_length": 549.25, "epoch": 1.2943458980044347, "grad_norm": 0.43664148449897766, "kl": 0.14087438583374023, "learning_rate": 4.241887204331306e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4670 }, { "completion_length": 573.0, "epoch": 1.2946230598669624, "grad_norm": 0.0, "kl": 0.12242491543292999, "learning_rate": 4.241573145624468e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4671 }, { "completion_length": 610.5, "epoch": 1.29490022172949, "grad_norm": 0.0, "kl": 0.13488224148750305, "learning_rate": 4.241259033510827e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4672 }, { "completion_length": 979.0, "epoch": 1.2951773835920177, "grad_norm": 0.2384958267211914, "kl": 0.1023096889257431, "learning_rate": 4.2409448680000144e-06, "loss": -0.0, "reward": 1.71875, "reward_std": 0.0625, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.71875, "step": 4673 }, { "completion_length": 589.5, "epoch": 1.2954545454545454, "grad_norm": 0.0, "kl": 0.13351687788963318, "learning_rate": 4.240630649101666e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4674 }, { "completion_length": 626.0, "epoch": 1.295731707317073, "grad_norm": 0.0, "kl": 0.1145675852894783, "learning_rate": 4.240316376825415e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4675 }, { "completion_length": 670.5, "epoch": 1.2960088691796008, "grad_norm": 0.0, "kl": 0.5630763173103333, "learning_rate": 4.240002051180902e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4676 }, { "completion_length": 550.25, "epoch": 1.2962860310421287, "grad_norm": 0.0, "kl": 0.15808109939098358, "learning_rate": 4.239687672177762e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4677 }, { "completion_length": 621.0, "epoch": 1.2965631929046564, "grad_norm": 0.3584040403366089, "kl": 0.1437191665172577, "learning_rate": 4.23937323982564e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4678 }, { "completion_length": 631.0, "epoch": 1.296840354767184, "grad_norm": 0.0, "kl": 0.14931169152259827, "learning_rate": 4.239058754134176e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4679 }, { "completion_length": 523.5, "epoch": 1.2971175166297118, "grad_norm": 0.0, "kl": 0.14678902924060822, "learning_rate": 4.238744215113013e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4680 }, { "completion_length": 492.5, "epoch": 1.2973946784922394, "grad_norm": 0.0, "kl": 0.15645383298397064, "learning_rate": 4.2384296227717995e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4681 }, { "completion_length": 520.25, "epoch": 1.2976718403547671, "grad_norm": 0.0, "kl": 0.1358046978712082, "learning_rate": 4.23811497712018e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4682 }, { "completion_length": 683.5, "epoch": 1.2979490022172948, "grad_norm": 0.0, "kl": 0.12430063635110855, "learning_rate": 4.2378002781678055e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4683 }, { "completion_length": 569.5, "epoch": 1.2982261640798227, "grad_norm": 0.0, "kl": 0.12010722607374191, "learning_rate": 4.237485525924325e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4684 }, { "completion_length": 592.75, "epoch": 1.2985033259423504, "grad_norm": 0.2936214804649353, "kl": 0.15382224321365356, "learning_rate": 4.237170720399392e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4685 }, { "completion_length": 591.25, "epoch": 1.298780487804878, "grad_norm": 0.4322100579738617, "kl": 0.16141986846923828, "learning_rate": 4.236855861602659e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4686 }, { "completion_length": 584.75, "epoch": 1.2990576496674058, "grad_norm": 0.0, "kl": 0.14188872277736664, "learning_rate": 4.236540949543782e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4687 }, { "completion_length": 561.25, "epoch": 1.2993348115299335, "grad_norm": 0.3283429443836212, "kl": 0.13758650422096252, "learning_rate": 4.236225984232417e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4688 }, { "completion_length": 552.0, "epoch": 1.2996119733924612, "grad_norm": 0.0, "kl": 0.114850714802742, "learning_rate": 4.235910965678224e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4689 }, { "completion_length": 627.75, "epoch": 1.2998891352549888, "grad_norm": 0.0, "kl": 0.14380818605422974, "learning_rate": 4.235595893890863e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4690 }, { "completion_length": 527.5, "epoch": 1.3001662971175167, "grad_norm": 0.0, "kl": 0.13828273117542267, "learning_rate": 4.235280768879996e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4691 }, { "completion_length": 550.25, "epoch": 1.3004434589800442, "grad_norm": 0.4260905086994171, "kl": 0.14624105393886566, "learning_rate": 4.234965590655287e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4692 }, { "completion_length": 622.25, "epoch": 1.3007206208425721, "grad_norm": 0.0, "kl": 0.14134056866168976, "learning_rate": 4.234650359226399e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4693 }, { "completion_length": 619.0, "epoch": 1.3009977827050998, "grad_norm": 0.4338120222091675, "kl": 0.11898421496152878, "learning_rate": 4.234335074603001e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4694 }, { "completion_length": 639.75, "epoch": 1.3012749445676275, "grad_norm": 0.0, "kl": 0.11877471953630447, "learning_rate": 4.234019736794762e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4695 }, { "completion_length": 570.5, "epoch": 1.3015521064301552, "grad_norm": 0.0, "kl": 0.1407477855682373, "learning_rate": 4.233704345811351e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4696 }, { "completion_length": 611.0, "epoch": 1.3018292682926829, "grad_norm": 0.0, "kl": 0.13274677097797394, "learning_rate": 4.233388901662438e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4697 }, { "completion_length": 542.75, "epoch": 1.3021064301552108, "grad_norm": 0.0, "kl": 0.1134430319070816, "learning_rate": 4.233073404357699e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4698 }, { "completion_length": 650.0, "epoch": 1.3023835920177382, "grad_norm": 0.30942395329475403, "kl": 0.23101602494716644, "learning_rate": 4.232757853906808e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4699 }, { "completion_length": 605.75, "epoch": 1.3026607538802661, "grad_norm": 0.0, "kl": 0.13365283608436584, "learning_rate": 4.2324422503194425e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4700 }, { "completion_length": 668.0, "epoch": 1.3029379157427938, "grad_norm": 0.0, "kl": 0.13479869067668915, "learning_rate": 4.2321265936052795e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4701 }, { "completion_length": 541.25, "epoch": 1.3032150776053215, "grad_norm": 0.3806952238082886, "kl": 0.16564786434173584, "learning_rate": 4.231810883773999e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4702 }, { "completion_length": 655.25, "epoch": 1.3034922394678492, "grad_norm": 0.4111127257347107, "kl": 0.14147886633872986, "learning_rate": 4.231495120835283e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4703 }, { "completion_length": 522.25, "epoch": 1.3037694013303769, "grad_norm": 0.5001562237739563, "kl": 0.13738901913166046, "learning_rate": 4.2311793047988145e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4704 }, { "completion_length": 548.5, "epoch": 1.3040465631929046, "grad_norm": 0.3499992787837982, "kl": 0.14452888071537018, "learning_rate": 4.230863435674278e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4705 }, { "completion_length": 713.25, "epoch": 1.3043237250554323, "grad_norm": 0.38441574573516846, "kl": 0.20874054729938507, "learning_rate": 4.2305475134713606e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4706 }, { "completion_length": 538.5, "epoch": 1.3046008869179602, "grad_norm": 0.48428261280059814, "kl": 0.12905292212963104, "learning_rate": 4.23023153819975e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4707 }, { "completion_length": 636.5, "epoch": 1.3048780487804879, "grad_norm": 0.4223211407661438, "kl": 0.1510411500930786, "learning_rate": 4.229915509869136e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4708 }, { "completion_length": 595.75, "epoch": 1.3051552106430155, "grad_norm": 0.0, "kl": 0.1409638226032257, "learning_rate": 4.229599428489208e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4709 }, { "completion_length": 649.0, "epoch": 1.3054323725055432, "grad_norm": 0.0, "kl": 0.14417186379432678, "learning_rate": 4.229283294069662e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4710 }, { "completion_length": 636.75, "epoch": 1.305709534368071, "grad_norm": 0.35202091932296753, "kl": 0.1696942001581192, "learning_rate": 4.22896710662019e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4711 }, { "completion_length": 557.75, "epoch": 1.3059866962305986, "grad_norm": 0.0, "kl": 0.10605372488498688, "learning_rate": 4.22865086615049e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4712 }, { "completion_length": 575.5, "epoch": 1.3062638580931263, "grad_norm": 0.0, "kl": 0.14118321239948273, "learning_rate": 4.228334572670258e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4713 }, { "completion_length": 572.25, "epoch": 1.3065410199556542, "grad_norm": 0.0, "kl": 0.12179452180862427, "learning_rate": 4.228018226189196e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4714 }, { "completion_length": 554.75, "epoch": 1.3068181818181819, "grad_norm": 0.0, "kl": 0.11531572043895721, "learning_rate": 4.227701826717002e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4715 }, { "completion_length": 589.5, "epoch": 1.3070953436807096, "grad_norm": 0.0, "kl": 0.12073136121034622, "learning_rate": 4.227385374263381e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4716 }, { "completion_length": 602.5, "epoch": 1.3073725055432373, "grad_norm": 0.0, "kl": 0.12585461139678955, "learning_rate": 4.227068868838035e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4717 }, { "completion_length": 590.0, "epoch": 1.307649667405765, "grad_norm": 0.0, "kl": 0.1136845275759697, "learning_rate": 4.226752310450672e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4718 }, { "completion_length": 604.25, "epoch": 1.3079268292682926, "grad_norm": 0.0, "kl": 0.13939006626605988, "learning_rate": 4.226435699110999e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4719 }, { "completion_length": 653.25, "epoch": 1.3082039911308203, "grad_norm": 0.0, "kl": 0.14190825819969177, "learning_rate": 4.226119034828724e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4720 }, { "completion_length": 807.5, "epoch": 1.3084811529933482, "grad_norm": 0.0, "kl": 0.114815354347229, "learning_rate": 4.22580231761356e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4721 }, { "completion_length": 672.75, "epoch": 1.308758314855876, "grad_norm": 0.3631332516670227, "kl": 0.10534173995256424, "learning_rate": 4.225485547475217e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4722 }, { "completion_length": 621.5, "epoch": 1.3090354767184036, "grad_norm": 0.0, "kl": 0.11583984643220901, "learning_rate": 4.225168724423411e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4723 }, { "completion_length": 631.25, "epoch": 1.3093126385809313, "grad_norm": 0.35585111379623413, "kl": 0.12796199321746826, "learning_rate": 4.224851848467856e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4724 }, { "completion_length": 531.5, "epoch": 1.309589800443459, "grad_norm": 0.0, "kl": 0.2304421067237854, "learning_rate": 4.22453491961827e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4725 }, { "completion_length": 653.75, "epoch": 1.3098669623059866, "grad_norm": 0.0, "kl": 0.105611152946949, "learning_rate": 4.224217937884371e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4726 }, { "completion_length": 638.0, "epoch": 1.3101441241685143, "grad_norm": 0.0, "kl": 0.12697121500968933, "learning_rate": 4.223900903275883e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4727 }, { "completion_length": 620.0, "epoch": 1.3104212860310422, "grad_norm": 0.0, "kl": 0.11723436415195465, "learning_rate": 4.223583815802523e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4728 }, { "completion_length": 659.75, "epoch": 1.3106984478935697, "grad_norm": 0.0, "kl": 0.13717728853225708, "learning_rate": 4.2232666754740185e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4729 }, { "completion_length": 622.5, "epoch": 1.3109756097560976, "grad_norm": 0.0, "kl": 0.11779782176017761, "learning_rate": 4.222949482300094e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4730 }, { "completion_length": 630.5, "epoch": 1.3112527716186253, "grad_norm": 0.0, "kl": 1.4460902214050293, "learning_rate": 4.222632236290475e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4731 }, { "completion_length": 590.0, "epoch": 1.311529933481153, "grad_norm": 0.0, "kl": 0.1423240751028061, "learning_rate": 4.222314937454892e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4732 }, { "completion_length": 586.0, "epoch": 1.3118070953436807, "grad_norm": 0.0, "kl": 0.13441944122314453, "learning_rate": 4.221997585803075e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4733 }, { "completion_length": 610.5, "epoch": 1.3120842572062084, "grad_norm": 0.0, "kl": 0.16522926092147827, "learning_rate": 4.221680181344755e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4734 }, { "completion_length": 604.25, "epoch": 1.3123614190687363, "grad_norm": 0.34881362318992615, "kl": 0.12827825546264648, "learning_rate": 4.221362724089665e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4735 }, { "completion_length": 639.75, "epoch": 1.3126385809312637, "grad_norm": 0.2923438847064972, "kl": 0.12784960865974426, "learning_rate": 4.221045214047542e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4736 }, { "completion_length": 614.0, "epoch": 1.3129157427937916, "grad_norm": 0.3884676992893219, "kl": 0.13118168711662292, "learning_rate": 4.220727651228122e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4737 }, { "completion_length": 755.5, "epoch": 1.3131929046563193, "grad_norm": 0.4410278797149658, "kl": 0.25320708751678467, "learning_rate": 4.220410035641142e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4738 }, { "completion_length": 704.5, "epoch": 1.313470066518847, "grad_norm": 0.3917783200740814, "kl": 0.14685866236686707, "learning_rate": 4.220092367296343e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4739 }, { "completion_length": 600.0, "epoch": 1.3137472283813747, "grad_norm": 0.0, "kl": 0.13905979692935944, "learning_rate": 4.219774646203467e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4740 }, { "completion_length": 593.0, "epoch": 1.3140243902439024, "grad_norm": 0.4109801650047302, "kl": 0.14692021906375885, "learning_rate": 4.219456872372256e-06, "loss": -0.0, "reward": 1.875, "reward_std": 0.25, "rewards/confident_score_func": 0.125, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4741 }, { "completion_length": 584.0, "epoch": 1.3143015521064303, "grad_norm": 0.0, "kl": 0.1382368952035904, "learning_rate": 4.219139045812456e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4742 }, { "completion_length": 549.5, "epoch": 1.3145787139689578, "grad_norm": 0.0, "kl": 0.21609754860401154, "learning_rate": 4.218821166533813e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4743 }, { "completion_length": 704.5, "epoch": 1.3148558758314857, "grad_norm": 0.30976954102516174, "kl": 0.10061033815145493, "learning_rate": 4.2185032345460745e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4744 }, { "completion_length": 634.0, "epoch": 1.3151330376940134, "grad_norm": 0.0, "kl": 0.12666641175746918, "learning_rate": 4.218185249858991e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4745 }, { "completion_length": 587.5, "epoch": 1.315410199556541, "grad_norm": 0.0, "kl": 0.19888606667518616, "learning_rate": 4.217867212482313e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4746 }, { "completion_length": 651.5, "epoch": 1.3156873614190687, "grad_norm": 0.0, "kl": 0.15028612315654755, "learning_rate": 4.217549122425795e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4747 }, { "completion_length": 586.5, "epoch": 1.3159645232815964, "grad_norm": 0.7568463087081909, "kl": 0.12187549471855164, "learning_rate": 4.217230979699188e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4748 }, { "completion_length": 605.25, "epoch": 1.316241685144124, "grad_norm": 0.0, "kl": 0.12838469445705414, "learning_rate": 4.216912784312251e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4749 }, { "completion_length": 658.0, "epoch": 1.3165188470066518, "grad_norm": 0.0, "kl": 0.1494973748922348, "learning_rate": 4.216594536274743e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4750 }, { "completion_length": 660.75, "epoch": 1.3167960088691797, "grad_norm": 0.0, "kl": 0.10795436799526215, "learning_rate": 4.21627623559642e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4751 }, { "completion_length": 661.75, "epoch": 1.3170731707317074, "grad_norm": 0.0, "kl": 0.13554613292217255, "learning_rate": 4.215957882287044e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4752 }, { "completion_length": 710.25, "epoch": 1.317350332594235, "grad_norm": 0.0, "kl": 0.15427958965301514, "learning_rate": 4.215639476356379e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4753 }, { "completion_length": 674.0, "epoch": 1.3176274944567627, "grad_norm": 0.0, "kl": 0.12505881488323212, "learning_rate": 4.215321017814187e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4754 }, { "completion_length": 682.25, "epoch": 1.3179046563192904, "grad_norm": 0.3215292990207672, "kl": 0.12504693865776062, "learning_rate": 4.215002506670236e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4755 }, { "completion_length": 693.75, "epoch": 1.3181818181818181, "grad_norm": 0.0, "kl": 0.10110928118228912, "learning_rate": 4.214683942934291e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4756 }, { "completion_length": 534.25, "epoch": 1.3184589800443458, "grad_norm": 0.0, "kl": 0.19644257426261902, "learning_rate": 4.214365326616124e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4757 }, { "completion_length": 575.75, "epoch": 1.3187361419068737, "grad_norm": 0.0, "kl": 0.15667855739593506, "learning_rate": 4.214046657725503e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4758 }, { "completion_length": 952.0, "epoch": 1.3190133037694014, "grad_norm": 0.21910786628723145, "kl": 0.11410564184188843, "learning_rate": 4.213727936272201e-06, "loss": -0.0, "reward": 5.59375, "reward_std": 0.3125, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.59375, "step": 4759 }, { "completion_length": 617.5, "epoch": 1.319290465631929, "grad_norm": 0.0, "kl": 0.11806105822324753, "learning_rate": 4.213409162265993e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4760 }, { "completion_length": 609.75, "epoch": 1.3195676274944568, "grad_norm": 0.36331653594970703, "kl": 0.13865648210048676, "learning_rate": 4.213090335716654e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4761 }, { "completion_length": 677.75, "epoch": 1.3198447893569845, "grad_norm": 0.3008950650691986, "kl": 0.1279648095369339, "learning_rate": 4.21277145663396e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4762 }, { "completion_length": 599.25, "epoch": 1.3201219512195121, "grad_norm": 0.0, "kl": 0.19713255763053894, "learning_rate": 4.212452525027689e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4763 }, { "completion_length": 566.5, "epoch": 1.3203991130820398, "grad_norm": 0.0, "kl": 0.14258717000484467, "learning_rate": 4.212133540907625e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4764 }, { "completion_length": 659.5, "epoch": 1.3206762749445677, "grad_norm": 0.0, "kl": 0.118719682097435, "learning_rate": 4.211814504283546e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4765 }, { "completion_length": 656.5, "epoch": 1.3209534368070954, "grad_norm": 0.0, "kl": 0.2715755105018616, "learning_rate": 4.211495415165238e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4766 }, { "completion_length": 618.25, "epoch": 1.3212305986696231, "grad_norm": 0.0, "kl": 0.11082355678081512, "learning_rate": 4.211176273562485e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4767 }, { "completion_length": 590.5, "epoch": 1.3215077605321508, "grad_norm": 0.3479204773902893, "kl": 0.15222413837909698, "learning_rate": 4.210857079485074e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4768 }, { "completion_length": 731.0, "epoch": 1.3217849223946785, "grad_norm": 0.0, "kl": 0.24971044063568115, "learning_rate": 4.210537832942794e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4769 }, { "completion_length": 653.0, "epoch": 1.3220620842572062, "grad_norm": 0.0, "kl": 0.13997802138328552, "learning_rate": 4.210218533945433e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4770 }, { "completion_length": 591.25, "epoch": 1.3223392461197339, "grad_norm": 0.0, "kl": 0.122789166867733, "learning_rate": 4.209899182502784e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4771 }, { "completion_length": 609.25, "epoch": 1.3226164079822618, "grad_norm": 0.0, "kl": 0.1688203364610672, "learning_rate": 4.2095797786246415e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4772 }, { "completion_length": 616.5, "epoch": 1.3228935698447892, "grad_norm": 0.0, "kl": 0.12609274685382843, "learning_rate": 4.209260322320797e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4773 }, { "completion_length": 627.25, "epoch": 1.3231707317073171, "grad_norm": 0.37843871116638184, "kl": 0.11774848401546478, "learning_rate": 4.2089408136010505e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4774 }, { "completion_length": 629.5, "epoch": 1.3234478935698448, "grad_norm": 0.0, "kl": 0.11938861757516861, "learning_rate": 4.208621252475198e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4775 }, { "completion_length": 608.5, "epoch": 1.3237250554323725, "grad_norm": 0.3612123131752014, "kl": 0.11491985619068146, "learning_rate": 4.2083016389530385e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4776 }, { "completion_length": 746.0, "epoch": 1.3240022172949002, "grad_norm": 0.0, "kl": 0.15061716735363007, "learning_rate": 4.207981973044374e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4777 }, { "completion_length": 547.75, "epoch": 1.3242793791574279, "grad_norm": 0.3812443017959595, "kl": 0.12067222595214844, "learning_rate": 4.2076622547590085e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4778 }, { "completion_length": 713.75, "epoch": 1.3245565410199558, "grad_norm": 0.0, "kl": 0.1173282191157341, "learning_rate": 4.207342484106745e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4779 }, { "completion_length": 677.0, "epoch": 1.3248337028824833, "grad_norm": 0.3194526433944702, "kl": 0.11847644299268723, "learning_rate": 4.207022661097389e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4780 }, { "completion_length": 671.0, "epoch": 1.3251108647450112, "grad_norm": 0.0, "kl": 0.11169297993183136, "learning_rate": 4.206702785740749e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4781 }, { "completion_length": 527.0, "epoch": 1.3253880266075388, "grad_norm": 0.0, "kl": 0.12696167826652527, "learning_rate": 4.206382858046636e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4782 }, { "completion_length": 685.25, "epoch": 1.3256651884700665, "grad_norm": 0.31026196479797363, "kl": 0.16986985504627228, "learning_rate": 4.206062878024858e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4783 }, { "completion_length": 593.75, "epoch": 1.3259423503325942, "grad_norm": 0.3881186842918396, "kl": 0.1583462506532669, "learning_rate": 4.205742845685228e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4784 }, { "completion_length": 543.75, "epoch": 1.326219512195122, "grad_norm": 0.0, "kl": 0.12312302738428116, "learning_rate": 4.205422761037561e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4785 }, { "completion_length": 643.5, "epoch": 1.3264966740576496, "grad_norm": 0.29102638363838196, "kl": 0.13759329915046692, "learning_rate": 4.205102624091673e-06, "loss": -0.0, "reward": 5.59375, "reward_std": 0.3125, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 4786 }, { "completion_length": 688.0, "epoch": 1.3267738359201773, "grad_norm": 0.0, "kl": 0.12976177036762238, "learning_rate": 4.20478243485738e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4787 }, { "completion_length": 572.5, "epoch": 1.3270509977827052, "grad_norm": 0.0, "kl": 0.16887636482715607, "learning_rate": 4.2044621933445015e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4788 }, { "completion_length": 620.75, "epoch": 1.3273281596452329, "grad_norm": 0.0, "kl": 0.14868704974651337, "learning_rate": 4.204141899562858e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4789 }, { "completion_length": 630.25, "epoch": 1.3276053215077606, "grad_norm": 0.0, "kl": 0.13927939534187317, "learning_rate": 4.203821553522271e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4790 }, { "completion_length": 577.0, "epoch": 1.3278824833702882, "grad_norm": 0.0, "kl": 0.16377581655979156, "learning_rate": 4.203501155232565e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4791 }, { "completion_length": 637.0, "epoch": 1.328159645232816, "grad_norm": 0.0, "kl": 0.11906582117080688, "learning_rate": 4.203180704703566e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4792 }, { "completion_length": 659.0, "epoch": 1.3284368070953436, "grad_norm": 0.0, "kl": 0.14410461485385895, "learning_rate": 4.202860201945099e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4793 }, { "completion_length": 699.75, "epoch": 1.3287139689578713, "grad_norm": 0.0, "kl": 0.16140460968017578, "learning_rate": 4.202539646966993e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4794 }, { "completion_length": 602.25, "epoch": 1.3289911308203992, "grad_norm": 0.0, "kl": 0.1438666135072708, "learning_rate": 4.202219039779078e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4795 }, { "completion_length": 726.25, "epoch": 1.329268292682927, "grad_norm": 0.0, "kl": 0.1321973353624344, "learning_rate": 4.201898380391188e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4796 }, { "completion_length": 628.25, "epoch": 1.3295454545454546, "grad_norm": 0.0, "kl": 0.16989350318908691, "learning_rate": 4.201577668813153e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4797 }, { "completion_length": 577.0, "epoch": 1.3298226164079823, "grad_norm": 0.0, "kl": 0.12232669442892075, "learning_rate": 4.2012569050548096e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4798 }, { "completion_length": 591.5, "epoch": 1.33009977827051, "grad_norm": 0.0, "kl": 0.17357777059078217, "learning_rate": 4.2009360891259934e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4799 }, { "completion_length": 699.5, "epoch": 1.3303769401330376, "grad_norm": 0.30976617336273193, "kl": 0.2636917531490326, "learning_rate": 4.200615221036544e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4800 }, { "completion_length": 616.75, "epoch": 1.3306541019955653, "grad_norm": 0.0, "kl": 0.131526917219162, "learning_rate": 4.200294300796299e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4801 }, { "completion_length": 614.25, "epoch": 1.3309312638580932, "grad_norm": 0.0, "kl": 0.1379735767841339, "learning_rate": 4.199973328415102e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4802 }, { "completion_length": 606.25, "epoch": 1.331208425720621, "grad_norm": 0.35807904601097107, "kl": 0.12872521579265594, "learning_rate": 4.199652303902794e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4803 }, { "completion_length": 948.75, "epoch": 1.3314855875831486, "grad_norm": 0.3258776366710663, "kl": 0.10263606160879135, "learning_rate": 4.1993312272692214e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4804 }, { "completion_length": 545.0, "epoch": 1.3317627494456763, "grad_norm": 0.4494006931781769, "kl": 0.21551622450351715, "learning_rate": 4.199010098524228e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4805 }, { "completion_length": 736.75, "epoch": 1.332039911308204, "grad_norm": 0.0, "kl": 0.10754672437906265, "learning_rate": 4.198688917677662e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4806 }, { "completion_length": 573.5, "epoch": 1.3323170731707317, "grad_norm": 0.0, "kl": 0.20855411887168884, "learning_rate": 4.198367684739375e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4807 }, { "completion_length": 613.0, "epoch": 1.3325942350332594, "grad_norm": 0.3297010362148285, "kl": 0.11926110833883286, "learning_rate": 4.1980463997192146e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4808 }, { "completion_length": 616.25, "epoch": 1.3328713968957873, "grad_norm": 0.0, "kl": 0.1283949911594391, "learning_rate": 4.197725062627035e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4809 }, { "completion_length": 599.75, "epoch": 1.3331485587583147, "grad_norm": 0.0, "kl": 0.10379406064748764, "learning_rate": 4.197403673472691e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4810 }, { "completion_length": 713.25, "epoch": 1.3334257206208426, "grad_norm": 0.3776475489139557, "kl": 0.11534597724676132, "learning_rate": 4.197082232266037e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4811 }, { "completion_length": 626.5, "epoch": 1.3337028824833703, "grad_norm": 0.0, "kl": 0.12992841005325317, "learning_rate": 4.19676073901693e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4812 }, { "completion_length": 567.25, "epoch": 1.333980044345898, "grad_norm": 1.0918937921524048, "kl": 0.21009564399719238, "learning_rate": 4.19643919373523e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4813 }, { "completion_length": 570.0, "epoch": 1.3342572062084257, "grad_norm": 0.3883982300758362, "kl": 0.19900324940681458, "learning_rate": 4.196117596430797e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4814 }, { "completion_length": 662.5, "epoch": 1.3345343680709534, "grad_norm": 0.0, "kl": 0.16816429793834686, "learning_rate": 4.195795947113493e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4815 }, { "completion_length": 652.0, "epoch": 1.3348115299334813, "grad_norm": 0.0, "kl": 0.10530295968055725, "learning_rate": 4.195474245793181e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4816 }, { "completion_length": 574.0, "epoch": 1.3350886917960088, "grad_norm": 0.0, "kl": 0.15923868119716644, "learning_rate": 4.195152492479727e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4817 }, { "completion_length": 613.25, "epoch": 1.3353658536585367, "grad_norm": 0.0, "kl": 0.22846342623233795, "learning_rate": 4.194830687182999e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4818 }, { "completion_length": 691.25, "epoch": 1.3356430155210643, "grad_norm": 0.0, "kl": 0.16058692336082458, "learning_rate": 4.1945088299128635e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4819 }, { "completion_length": 635.75, "epoch": 1.335920177383592, "grad_norm": 0.41329169273376465, "kl": 0.12364614009857178, "learning_rate": 4.19418692067919e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4820 }, { "completion_length": 600.5, "epoch": 1.3361973392461197, "grad_norm": 0.37663841247558594, "kl": 0.14215229451656342, "learning_rate": 4.193864959491853e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4821 }, { "completion_length": 639.25, "epoch": 1.3364745011086474, "grad_norm": 0.0, "kl": 0.17078439891338348, "learning_rate": 4.193542946360723e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4822 }, { "completion_length": 589.0, "epoch": 1.336751662971175, "grad_norm": 0.0, "kl": 0.14925570785999298, "learning_rate": 4.193220881295675e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4823 }, { "completion_length": 583.75, "epoch": 1.3370288248337028, "grad_norm": 0.0, "kl": 0.13952256739139557, "learning_rate": 4.192898764306588e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4824 }, { "completion_length": 712.25, "epoch": 1.3373059866962307, "grad_norm": 0.333782434463501, "kl": 0.13975419104099274, "learning_rate": 4.192576595403337e-06, "loss": 0.0, "reward": 3.1875, "reward_std": 3.0982184410095215, "rewards/confident_score_func": 0.75, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.6875, "step": 4825 }, { "completion_length": 536.25, "epoch": 1.3375831485587584, "grad_norm": 0.0, "kl": 0.1455431878566742, "learning_rate": 4.192254374595803e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4826 }, { "completion_length": 629.75, "epoch": 1.337860310421286, "grad_norm": 0.39927637577056885, "kl": 0.1322970986366272, "learning_rate": 4.191932101893868e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4827 }, { "completion_length": 707.75, "epoch": 1.3381374722838137, "grad_norm": 0.0, "kl": 0.13585388660430908, "learning_rate": 4.191609777307413e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4828 }, { "completion_length": 649.5, "epoch": 1.3384146341463414, "grad_norm": 0.36915117502212524, "kl": 0.12359916418790817, "learning_rate": 4.191287400846322e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4829 }, { "completion_length": 559.75, "epoch": 1.3386917960088691, "grad_norm": 0.0, "kl": 0.15596012771129608, "learning_rate": 4.190964972520483e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4830 }, { "completion_length": 633.5, "epoch": 1.3389689578713968, "grad_norm": 0.0, "kl": 0.19205106794834137, "learning_rate": 4.190642492339782e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4831 }, { "completion_length": 603.0, "epoch": 1.3392461197339247, "grad_norm": 0.0, "kl": 0.1388290673494339, "learning_rate": 4.190319960314109e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4832 }, { "completion_length": 591.25, "epoch": 1.3395232815964524, "grad_norm": 0.0, "kl": 0.16009338200092316, "learning_rate": 4.189997376453354e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4833 }, { "completion_length": 595.0, "epoch": 1.33980044345898, "grad_norm": 0.3675384521484375, "kl": 0.1445954442024231, "learning_rate": 4.189674740767411e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4834 }, { "completion_length": 537.5, "epoch": 1.3400776053215078, "grad_norm": 0.0, "kl": 0.17670927941799164, "learning_rate": 4.189352053266171e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4835 }, { "completion_length": 568.75, "epoch": 1.3403547671840355, "grad_norm": 0.0, "kl": 0.5061236619949341, "learning_rate": 4.1890293139595324e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4836 }, { "completion_length": 743.75, "epoch": 1.3406319290465631, "grad_norm": 0.0, "kl": 0.13622763752937317, "learning_rate": 4.18870652285739e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4837 }, { "completion_length": 676.0, "epoch": 1.3409090909090908, "grad_norm": 0.0, "kl": 0.1389622837305069, "learning_rate": 4.188383679969643e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4838 }, { "completion_length": 587.25, "epoch": 1.3411862527716187, "grad_norm": 0.4004455804824829, "kl": 0.16401292383670807, "learning_rate": 4.188060785306192e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4839 }, { "completion_length": 655.25, "epoch": 1.3414634146341464, "grad_norm": 0.35022369027137756, "kl": 0.19305534660816193, "learning_rate": 4.187737838876941e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4840 }, { "completion_length": 553.25, "epoch": 1.341740576496674, "grad_norm": 0.4854244589805603, "kl": 0.19077403843402863, "learning_rate": 4.18741484069179e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4841 }, { "completion_length": 600.75, "epoch": 1.3420177383592018, "grad_norm": 0.0, "kl": 0.15093736350536346, "learning_rate": 4.187091790760644e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4842 }, { "completion_length": 700.25, "epoch": 1.3422949002217295, "grad_norm": 0.0, "kl": 0.3188578486442566, "learning_rate": 4.186768689093412e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4843 }, { "completion_length": 504.25, "epoch": 1.3425720620842572, "grad_norm": 0.0, "kl": 0.13400399684906006, "learning_rate": 4.186445535700002e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4844 }, { "completion_length": 598.5, "epoch": 1.3428492239467849, "grad_norm": 0.0, "kl": 0.15935535728931427, "learning_rate": 4.186122330590322e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4845 }, { "completion_length": 560.0, "epoch": 1.3431263858093128, "grad_norm": 0.0, "kl": 0.19712945818901062, "learning_rate": 4.185799073774284e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4846 }, { "completion_length": 531.75, "epoch": 1.3434035476718402, "grad_norm": 0.3961902856826782, "kl": 0.13505619764328003, "learning_rate": 4.185475765261801e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4847 }, { "completion_length": 676.75, "epoch": 1.3436807095343681, "grad_norm": 0.47800344228744507, "kl": 0.11039368808269501, "learning_rate": 4.185152405062788e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4848 }, { "completion_length": 642.75, "epoch": 1.3439578713968958, "grad_norm": 0.0, "kl": 0.16113749146461487, "learning_rate": 4.184828993187161e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4849 }, { "completion_length": 665.75, "epoch": 1.3442350332594235, "grad_norm": 0.0, "kl": 0.13429345190525055, "learning_rate": 4.184505529644838e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4850 }, { "completion_length": 665.75, "epoch": 1.3445121951219512, "grad_norm": 0.0, "kl": 0.17777763307094574, "learning_rate": 4.1841820144457366e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4851 }, { "completion_length": 647.0, "epoch": 1.3447893569844789, "grad_norm": 0.0, "kl": 0.1414501816034317, "learning_rate": 4.183858447599779e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4852 }, { "completion_length": 602.75, "epoch": 1.3450665188470068, "grad_norm": 0.0, "kl": 0.15949758887290955, "learning_rate": 4.183534829116888e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4853 }, { "completion_length": 586.0, "epoch": 1.3453436807095343, "grad_norm": 0.0, "kl": 0.1396716982126236, "learning_rate": 4.183211159006987e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4854 }, { "completion_length": 793.0, "epoch": 1.3456208425720622, "grad_norm": 0.0, "kl": 0.12265309691429138, "learning_rate": 4.182887437280002e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4855 }, { "completion_length": 602.0, "epoch": 1.3458980044345898, "grad_norm": 0.0, "kl": 0.16012713313102722, "learning_rate": 4.182563663945859e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4856 }, { "completion_length": 619.5, "epoch": 1.3461751662971175, "grad_norm": 0.0, "kl": 0.1526883989572525, "learning_rate": 4.182239839014488e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4857 }, { "completion_length": 698.75, "epoch": 1.3464523281596452, "grad_norm": 0.0, "kl": 0.1171586886048317, "learning_rate": 4.181915962495821e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4858 }, { "completion_length": 526.75, "epoch": 1.346729490022173, "grad_norm": 0.0, "kl": 0.1787893921136856, "learning_rate": 4.181592034399786e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4859 }, { "completion_length": 608.0, "epoch": 1.3470066518847006, "grad_norm": 0.3793528079986572, "kl": 0.24407899379730225, "learning_rate": 4.181268054736319e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4860 }, { "completion_length": 693.5, "epoch": 1.3472838137472283, "grad_norm": 0.41152438521385193, "kl": 0.5682688355445862, "learning_rate": 4.1809440235153545e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4861 }, { "completion_length": 675.5, "epoch": 1.3475609756097562, "grad_norm": 0.0, "kl": 0.12741230428218842, "learning_rate": 4.180619940746829e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4862 }, { "completion_length": 566.0, "epoch": 1.3478381374722839, "grad_norm": 0.42462706565856934, "kl": 0.1766667366027832, "learning_rate": 4.1802958064406814e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4863 }, { "completion_length": 600.5, "epoch": 1.3481152993348116, "grad_norm": 0.37448281049728394, "kl": 0.37195584177970886, "learning_rate": 4.179971620606852e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4864 }, { "completion_length": 561.25, "epoch": 1.3483924611973392, "grad_norm": 0.0, "kl": 0.13681857287883759, "learning_rate": 4.179647383255281e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4865 }, { "completion_length": 604.75, "epoch": 1.348669623059867, "grad_norm": 0.3814867436885834, "kl": 0.14580519497394562, "learning_rate": 4.179323094395912e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4866 }, { "completion_length": 541.5, "epoch": 1.3489467849223946, "grad_norm": 0.0, "kl": 1.3879363536834717, "learning_rate": 4.178998754038689e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4867 }, { "completion_length": 624.0, "epoch": 1.3492239467849223, "grad_norm": 0.0, "kl": 0.15073886513710022, "learning_rate": 4.178674362193559e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4868 }, { "completion_length": 621.5, "epoch": 1.3495011086474502, "grad_norm": 0.0, "kl": 0.1501501351594925, "learning_rate": 4.17834991887047e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4869 }, { "completion_length": 652.25, "epoch": 1.349778270509978, "grad_norm": 0.47845664620399475, "kl": 0.13562722504138947, "learning_rate": 4.178025424079371e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4870 }, { "completion_length": 687.0, "epoch": 1.3500554323725056, "grad_norm": 0.0, "kl": 0.13065096735954285, "learning_rate": 4.177700877830211e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4871 }, { "completion_length": 617.0, "epoch": 1.3503325942350333, "grad_norm": 0.0, "kl": 0.14549453556537628, "learning_rate": 4.177376280132946e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4872 }, { "completion_length": 589.25, "epoch": 1.350609756097561, "grad_norm": 0.0, "kl": 0.17837975919246674, "learning_rate": 4.1770516309975264e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4873 }, { "completion_length": 595.0, "epoch": 1.3508869179600886, "grad_norm": 0.3639380633831024, "kl": 0.1248534619808197, "learning_rate": 4.176726930433911e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4874 }, { "completion_length": 579.25, "epoch": 1.3511640798226163, "grad_norm": 0.5204232931137085, "kl": 0.19502347707748413, "learning_rate": 4.176402178452055e-06, "loss": -0.0, "reward": 4.46875, "reward_std": 2.5625, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 4875 }, { "completion_length": 594.75, "epoch": 1.3514412416851442, "grad_norm": 0.4278841018676758, "kl": 0.18821756541728973, "learning_rate": 4.176077375061918e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4876 }, { "completion_length": 616.5, "epoch": 1.351718403547672, "grad_norm": 0.0, "kl": 0.24209082126617432, "learning_rate": 4.17575252027346e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4877 }, { "completion_length": 642.5, "epoch": 1.3519955654101996, "grad_norm": 0.0, "kl": 0.14263342320919037, "learning_rate": 4.175427614096643e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4878 }, { "completion_length": 569.75, "epoch": 1.3522727272727273, "grad_norm": 0.0, "kl": 0.15068158507347107, "learning_rate": 4.175102656541432e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4879 }, { "completion_length": 477.75, "epoch": 1.352549889135255, "grad_norm": 0.0, "kl": 0.16391396522521973, "learning_rate": 4.1747776476177896e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4880 }, { "completion_length": 656.0, "epoch": 1.3528270509977827, "grad_norm": 0.5252708196640015, "kl": 0.5430231690406799, "learning_rate": 4.174452587335684e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4881 }, { "completion_length": 588.5, "epoch": 1.3531042128603104, "grad_norm": 0.3948718309402466, "kl": 0.2284865826368332, "learning_rate": 4.174127475705082e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4882 }, { "completion_length": 589.5, "epoch": 1.3533813747228383, "grad_norm": 0.0, "kl": 0.16332212090492249, "learning_rate": 4.173802312735956e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4883 }, { "completion_length": 689.5, "epoch": 1.3536585365853657, "grad_norm": 0.6067862510681152, "kl": 0.15919749438762665, "learning_rate": 4.173477098438276e-06, "loss": 0.0, "reward": 3.71875, "reward_std": 2.3460404872894287, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.71875, "step": 4884 }, { "completion_length": 602.5, "epoch": 1.3539356984478936, "grad_norm": 0.40154847502708435, "kl": 0.1545640081167221, "learning_rate": 4.173151832822014e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4885 }, { "completion_length": 607.25, "epoch": 1.3542128603104213, "grad_norm": 0.5351694226264954, "kl": 1.3657838106155396, "learning_rate": 4.172826515897146e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4886 }, { "completion_length": 625.5, "epoch": 1.354490022172949, "grad_norm": 0.0, "kl": 0.13738346099853516, "learning_rate": 4.172501147673647e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4887 }, { "completion_length": 593.75, "epoch": 1.3547671840354767, "grad_norm": 0.0, "kl": 0.13773053884506226, "learning_rate": 4.172175728161496e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4888 }, { "completion_length": 613.0, "epoch": 1.3550443458980044, "grad_norm": 0.0, "kl": 0.16160239279270172, "learning_rate": 4.171850257370671e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4889 }, { "completion_length": 569.25, "epoch": 1.3553215077605323, "grad_norm": 0.0, "kl": 0.15610235929489136, "learning_rate": 4.1715247353111536e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4890 }, { "completion_length": 636.25, "epoch": 1.3555986696230597, "grad_norm": 0.37923121452331543, "kl": 0.20909126102924347, "learning_rate": 4.171199161992925e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4891 }, { "completion_length": 605.5, "epoch": 1.3558758314855877, "grad_norm": 0.0, "kl": 0.18828989565372467, "learning_rate": 4.170873537425971e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4892 }, { "completion_length": 620.0, "epoch": 1.3561529933481153, "grad_norm": 0.0, "kl": 0.14674808084964752, "learning_rate": 4.170547861620276e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4893 }, { "completion_length": 581.5, "epoch": 1.356430155210643, "grad_norm": 0.0, "kl": 0.13841015100479126, "learning_rate": 4.170222134585827e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4894 }, { "completion_length": 592.0, "epoch": 1.3567073170731707, "grad_norm": 0.3565610349178314, "kl": 0.17802287638187408, "learning_rate": 4.169896356332614e-06, "loss": -0.0, "reward": 1.59375, "reward_std": 0.3125, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 4895 }, { "completion_length": 698.5, "epoch": 1.3569844789356984, "grad_norm": 0.0, "kl": 0.17108413577079773, "learning_rate": 4.169570526870626e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4896 }, { "completion_length": 578.0, "epoch": 1.357261640798226, "grad_norm": 0.0, "kl": 0.1368759125471115, "learning_rate": 4.169244646209854e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4897 }, { "completion_length": 650.0, "epoch": 1.3575388026607538, "grad_norm": 0.0, "kl": 0.14248161017894745, "learning_rate": 4.168918714360295e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4898 }, { "completion_length": 588.75, "epoch": 1.3578159645232817, "grad_norm": 0.0, "kl": 1.7397804260253906, "learning_rate": 4.16859273133194e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4899 }, { "completion_length": 721.5, "epoch": 1.3580931263858094, "grad_norm": 0.0, "kl": 1.9357085227966309, "learning_rate": 4.168266697134788e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4900 }, { "completion_length": 571.5, "epoch": 1.358370288248337, "grad_norm": 0.0, "kl": 0.17529714107513428, "learning_rate": 4.1679406117788345e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4901 }, { "completion_length": 653.0, "epoch": 1.3586474501108647, "grad_norm": 0.0, "kl": 0.30326512455940247, "learning_rate": 4.167614475274082e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4902 }, { "completion_length": 691.25, "epoch": 1.3589246119733924, "grad_norm": 0.0, "kl": 0.1436881422996521, "learning_rate": 4.167288287630531e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4903 }, { "completion_length": 662.0, "epoch": 1.3592017738359201, "grad_norm": 0.0, "kl": 0.12318018078804016, "learning_rate": 4.166962048858184e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4904 }, { "completion_length": 564.75, "epoch": 1.3594789356984478, "grad_norm": 0.0, "kl": 0.19715017080307007, "learning_rate": 4.166635758967044e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4905 }, { "completion_length": 587.75, "epoch": 1.3597560975609757, "grad_norm": 0.0, "kl": 0.37385669350624084, "learning_rate": 4.166309417967119e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4906 }, { "completion_length": 583.75, "epoch": 1.3600332594235034, "grad_norm": 0.0, "kl": 0.1701204776763916, "learning_rate": 4.1659830258684165e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4907 }, { "completion_length": 568.0, "epoch": 1.360310421286031, "grad_norm": 0.0, "kl": 0.19353936612606049, "learning_rate": 4.165656582680945e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4908 }, { "completion_length": 602.25, "epoch": 1.3605875831485588, "grad_norm": 0.0, "kl": 0.1950853168964386, "learning_rate": 4.165330088414714e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4909 }, { "completion_length": 632.25, "epoch": 1.3608647450110865, "grad_norm": 0.0, "kl": 0.16692307591438293, "learning_rate": 4.165003543079738e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4910 }, { "completion_length": 730.5, "epoch": 1.3611419068736141, "grad_norm": 0.3215320110321045, "kl": 0.1301335096359253, "learning_rate": 4.164676946686028e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4911 }, { "completion_length": 650.25, "epoch": 1.3614190687361418, "grad_norm": 0.0, "kl": 0.31261909008026123, "learning_rate": 4.164350299243601e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4912 }, { "completion_length": 570.75, "epoch": 1.3616962305986697, "grad_norm": 0.0, "kl": 0.16874289512634277, "learning_rate": 4.164023600762476e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4913 }, { "completion_length": 627.75, "epoch": 1.3619733924611974, "grad_norm": 0.0, "kl": 0.18732842803001404, "learning_rate": 4.1636968512526676e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4914 }, { "completion_length": 617.5, "epoch": 1.362250554323725, "grad_norm": 0.0, "kl": 0.1498422920703888, "learning_rate": 4.163370050724198e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4915 }, { "completion_length": 605.75, "epoch": 1.3625277161862528, "grad_norm": 0.0, "kl": 0.1659778356552124, "learning_rate": 4.163043199187089e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4916 }, { "completion_length": 661.75, "epoch": 1.3628048780487805, "grad_norm": 0.0, "kl": 0.22211794555187225, "learning_rate": 4.162716296651362e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4917 }, { "completion_length": 598.5, "epoch": 1.3630820399113082, "grad_norm": 0.0, "kl": 0.15153995156288147, "learning_rate": 4.162389343127043e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4918 }, { "completion_length": 661.0, "epoch": 1.3633592017738358, "grad_norm": 0.0, "kl": 0.13790670037269592, "learning_rate": 4.162062338624159e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4919 }, { "completion_length": 616.25, "epoch": 1.3636363636363638, "grad_norm": 0.4879029095172882, "kl": 0.1614266186952591, "learning_rate": 4.161735283152737e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4920 }, { "completion_length": 661.25, "epoch": 1.3639135254988912, "grad_norm": 0.0, "kl": 0.1524350941181183, "learning_rate": 4.161408176722806e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4921 }, { "completion_length": 684.5, "epoch": 1.3641906873614191, "grad_norm": 0.438778281211853, "kl": 0.16307325661182404, "learning_rate": 4.161081019344398e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4922 }, { "completion_length": 607.0, "epoch": 1.3644678492239468, "grad_norm": 0.0, "kl": 0.26874643564224243, "learning_rate": 4.160753811027545e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4923 }, { "completion_length": 567.0, "epoch": 1.3647450110864745, "grad_norm": 0.0, "kl": 0.2090844362974167, "learning_rate": 4.160426551782281e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4924 }, { "completion_length": 532.5, "epoch": 1.3650221729490022, "grad_norm": 0.0, "kl": 0.1367766112089157, "learning_rate": 4.160099241618642e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4925 }, { "completion_length": 589.5, "epoch": 1.3652993348115299, "grad_norm": 0.0, "kl": 0.1840418577194214, "learning_rate": 4.159771880546665e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4926 }, { "completion_length": 638.25, "epoch": 1.3655764966740578, "grad_norm": 0.0, "kl": 0.15896157920360565, "learning_rate": 4.159444468576388e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4927 }, { "completion_length": 604.0, "epoch": 1.3658536585365852, "grad_norm": 0.0, "kl": 0.23740337789058685, "learning_rate": 4.159117005717853e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4928 }, { "completion_length": 601.5, "epoch": 1.3661308203991132, "grad_norm": 0.0, "kl": 0.14083893597126007, "learning_rate": 4.158789491981101e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4929 }, { "completion_length": 702.5, "epoch": 1.3664079822616408, "grad_norm": 0.0, "kl": 0.1467159539461136, "learning_rate": 4.158461927376176e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4930 }, { "completion_length": 618.75, "epoch": 1.3666851441241685, "grad_norm": 0.0, "kl": 0.144155353307724, "learning_rate": 4.158134311913123e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4931 }, { "completion_length": 659.75, "epoch": 1.3669623059866962, "grad_norm": 0.3589378595352173, "kl": 0.13797704875469208, "learning_rate": 4.1578066456019885e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4932 }, { "completion_length": 589.75, "epoch": 1.367239467849224, "grad_norm": 0.0, "kl": 0.17558230459690094, "learning_rate": 4.15747892845282e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4933 }, { "completion_length": 550.75, "epoch": 1.3675166297117516, "grad_norm": 0.0, "kl": 0.2248098999261856, "learning_rate": 4.157151160475668e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4934 }, { "completion_length": 650.5, "epoch": 1.3677937915742793, "grad_norm": 0.0, "kl": 0.1548483967781067, "learning_rate": 4.156823341680583e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4935 }, { "completion_length": 598.5, "epoch": 1.3680709534368072, "grad_norm": 0.0, "kl": 0.15770426392555237, "learning_rate": 4.156495472077618e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4936 }, { "completion_length": 639.0, "epoch": 1.3683481152993349, "grad_norm": 0.0, "kl": 0.6636159420013428, "learning_rate": 4.156167551676828e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4937 }, { "completion_length": 656.25, "epoch": 1.3686252771618626, "grad_norm": 0.0, "kl": 0.22809362411499023, "learning_rate": 4.15583958048827e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4938 }, { "completion_length": 649.0, "epoch": 1.3689024390243902, "grad_norm": 0.0, "kl": 0.15088413655757904, "learning_rate": 4.1555115585219985e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4939 }, { "completion_length": 632.0, "epoch": 1.369179600886918, "grad_norm": 0.0, "kl": 0.17017744481563568, "learning_rate": 4.155183485788075e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4940 }, { "completion_length": 564.75, "epoch": 1.3694567627494456, "grad_norm": 0.0, "kl": 0.1723494827747345, "learning_rate": 4.154855362296559e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4941 }, { "completion_length": 536.25, "epoch": 1.3697339246119733, "grad_norm": 0.0, "kl": 0.20583854615688324, "learning_rate": 4.154527188057513e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4942 }, { "completion_length": 659.25, "epoch": 1.3700110864745012, "grad_norm": 0.0, "kl": 0.16847820580005646, "learning_rate": 4.1541989630810015e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4943 }, { "completion_length": 614.0, "epoch": 1.370288248337029, "grad_norm": 0.0, "kl": 0.16526943445205688, "learning_rate": 4.153870687377089e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4944 }, { "completion_length": 620.0, "epoch": 1.3705654101995566, "grad_norm": 0.432473748922348, "kl": 0.1531299352645874, "learning_rate": 4.1535423609558424e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4945 }, { "completion_length": 644.5, "epoch": 1.3708425720620843, "grad_norm": 0.0, "kl": 0.16873379051685333, "learning_rate": 4.15321398382733e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4946 }, { "completion_length": 600.25, "epoch": 1.371119733924612, "grad_norm": 0.0, "kl": 0.16540752351284027, "learning_rate": 4.1528855560016214e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4947 }, { "completion_length": 696.25, "epoch": 1.3713968957871396, "grad_norm": 0.0, "kl": 0.12491027265787125, "learning_rate": 4.152557077488789e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4948 }, { "completion_length": 666.25, "epoch": 1.3716740576496673, "grad_norm": 0.0, "kl": 0.1445978879928589, "learning_rate": 4.152228548298906e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4949 }, { "completion_length": 580.25, "epoch": 1.3719512195121952, "grad_norm": 0.0, "kl": 0.1531137377023697, "learning_rate": 4.151899968442046e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4950 }, { "completion_length": 594.75, "epoch": 1.372228381374723, "grad_norm": 0.4970889389514923, "kl": 0.15541478991508484, "learning_rate": 4.151571337928285e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4951 }, { "completion_length": 634.0, "epoch": 1.3725055432372506, "grad_norm": 0.0, "kl": 0.18191014230251312, "learning_rate": 4.151242656767703e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4952 }, { "completion_length": 594.0, "epoch": 1.3727827050997783, "grad_norm": 0.566745936870575, "kl": 0.35130447149276733, "learning_rate": 4.150913924970377e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4953 }, { "completion_length": 553.75, "epoch": 1.373059866962306, "grad_norm": 0.0, "kl": 0.18345047533512115, "learning_rate": 4.150585142546388e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4954 }, { "completion_length": 762.5, "epoch": 1.3733370288248337, "grad_norm": 0.0, "kl": 0.13617263734340668, "learning_rate": 4.15025630950582e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4955 }, { "completion_length": 560.5, "epoch": 1.3736141906873613, "grad_norm": 0.40123966336250305, "kl": 0.2100527435541153, "learning_rate": 4.149927425858756e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4956 }, { "completion_length": 620.0, "epoch": 1.3738913525498893, "grad_norm": 0.0, "kl": 0.19915173947811127, "learning_rate": 4.149598491615279e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4957 }, { "completion_length": 632.5, "epoch": 1.3741685144124167, "grad_norm": 0.0, "kl": 0.1296883523464203, "learning_rate": 4.149269506785481e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4958 }, { "completion_length": 617.0, "epoch": 1.3744456762749446, "grad_norm": 0.0, "kl": 0.17338183522224426, "learning_rate": 4.148940471379446e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4959 }, { "completion_length": 653.75, "epoch": 1.3747228381374723, "grad_norm": 0.0, "kl": 0.1550336480140686, "learning_rate": 4.148611385407268e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4960 }, { "completion_length": 524.25, "epoch": 1.375, "grad_norm": 0.0, "kl": 0.2823772728443146, "learning_rate": 4.148282248879036e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4961 }, { "completion_length": 647.5, "epoch": 1.3752771618625277, "grad_norm": 0.0, "kl": 0.1500077098608017, "learning_rate": 4.147953061804845e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4962 }, { "completion_length": 699.5, "epoch": 1.3755543237250554, "grad_norm": 0.0, "kl": 0.19652360677719116, "learning_rate": 4.147623824194789e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4963 }, { "completion_length": 645.5, "epoch": 1.3758314855875833, "grad_norm": 0.0, "kl": 0.2684277892112732, "learning_rate": 4.1472945360589626e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4964 }, { "completion_length": 723.0, "epoch": 1.3761086474501107, "grad_norm": 0.0, "kl": 0.1535390168428421, "learning_rate": 4.146965197407466e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4965 }, { "completion_length": 662.75, "epoch": 1.3763858093126387, "grad_norm": 0.0, "kl": 0.15219759941101074, "learning_rate": 4.146635808250398e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4966 }, { "completion_length": 669.75, "epoch": 1.3766629711751663, "grad_norm": 0.0, "kl": 0.155934676527977, "learning_rate": 4.14630636859786e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4967 }, { "completion_length": 556.0, "epoch": 1.376940133037694, "grad_norm": 0.7101463079452515, "kl": 0.16339024901390076, "learning_rate": 4.145976878459954e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4968 }, { "completion_length": 559.0, "epoch": 1.3772172949002217, "grad_norm": 0.44329169392585754, "kl": 0.21447886526584625, "learning_rate": 4.145647337846785e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4969 }, { "completion_length": 678.75, "epoch": 1.3774944567627494, "grad_norm": 0.0, "kl": 0.1829257607460022, "learning_rate": 4.145317746768456e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4970 }, { "completion_length": 739.5, "epoch": 1.377771618625277, "grad_norm": 0.0, "kl": 0.17073459923267365, "learning_rate": 4.144988105235077e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4971 }, { "completion_length": 568.0, "epoch": 1.3780487804878048, "grad_norm": 0.0, "kl": 0.1710282862186432, "learning_rate": 4.144658413256756e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4972 }, { "completion_length": 628.75, "epoch": 1.3783259423503327, "grad_norm": 0.0, "kl": 0.15523988008499146, "learning_rate": 4.144328670843603e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4973 }, { "completion_length": 619.75, "epoch": 1.3786031042128604, "grad_norm": 0.0, "kl": 0.17778915166854858, "learning_rate": 4.14399887800573e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4974 }, { "completion_length": 602.75, "epoch": 1.378880266075388, "grad_norm": 0.43832269310951233, "kl": 0.6120191216468811, "learning_rate": 4.14366903475325e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4975 }, { "completion_length": 609.5, "epoch": 1.3791574279379157, "grad_norm": 0.0, "kl": 0.24360689520835876, "learning_rate": 4.143339141096279e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4976 }, { "completion_length": 508.5, "epoch": 1.3794345898004434, "grad_norm": 0.47654175758361816, "kl": 0.16881714761257172, "learning_rate": 4.143009197044932e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4977 }, { "completion_length": 638.0, "epoch": 1.379711751662971, "grad_norm": 0.33784326910972595, "kl": 0.15581762790679932, "learning_rate": 4.1426792026093274e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4978 }, { "completion_length": 675.0, "epoch": 1.3799889135254988, "grad_norm": 0.0, "kl": 0.14771747589111328, "learning_rate": 4.142349157799585e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4979 }, { "completion_length": 573.25, "epoch": 1.3802660753880267, "grad_norm": 0.0, "kl": 0.1947171688079834, "learning_rate": 4.142019062625826e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4980 }, { "completion_length": 617.0, "epoch": 1.3805432372505544, "grad_norm": 0.0, "kl": 0.2506178021430969, "learning_rate": 4.141688917098174e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4981 }, { "completion_length": 648.25, "epoch": 1.380820399113082, "grad_norm": 0.0, "kl": 0.1441231518983841, "learning_rate": 4.141358721226751e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4982 }, { "completion_length": 609.75, "epoch": 1.3810975609756098, "grad_norm": 0.0, "kl": 0.16217605769634247, "learning_rate": 4.141028475021685e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4983 }, { "completion_length": 670.5, "epoch": 1.3813747228381374, "grad_norm": 0.41829097270965576, "kl": 0.2171529233455658, "learning_rate": 4.140698178493101e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4984 }, { "completion_length": 666.0, "epoch": 1.3816518847006651, "grad_norm": 0.0, "kl": 0.17337553203105927, "learning_rate": 4.1403678316511296e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4985 }, { "completion_length": 770.5, "epoch": 1.3819290465631928, "grad_norm": 0.0, "kl": 0.1711178570985794, "learning_rate": 4.1400374345059e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4986 }, { "completion_length": 639.5, "epoch": 1.3822062084257207, "grad_norm": 0.0, "kl": 0.18852348625659943, "learning_rate": 4.139706987067545e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4987 }, { "completion_length": 543.75, "epoch": 1.3824833702882484, "grad_norm": 0.5516958236694336, "kl": 0.1806556135416031, "learning_rate": 4.139376489346198e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4988 }, { "completion_length": 545.25, "epoch": 1.382760532150776, "grad_norm": 0.0, "kl": 0.2186812460422516, "learning_rate": 4.139045941351993e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4989 }, { "completion_length": 516.25, "epoch": 1.3830376940133038, "grad_norm": 0.0, "kl": 0.15265177190303802, "learning_rate": 4.138715343095069e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4990 }, { "completion_length": 688.0, "epoch": 1.3833148558758315, "grad_norm": 0.0, "kl": 0.1727498471736908, "learning_rate": 4.1383846945855605e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4991 }, { "completion_length": 651.75, "epoch": 1.3835920177383592, "grad_norm": 0.0, "kl": 0.18192319571971893, "learning_rate": 4.1380539958336095e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4992 }, { "completion_length": 590.5, "epoch": 1.3838691796008868, "grad_norm": 0.0, "kl": 0.1727706789970398, "learning_rate": 4.137723246849356e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4993 }, { "completion_length": 504.0, "epoch": 1.3841463414634148, "grad_norm": 0.0, "kl": 0.1907602995634079, "learning_rate": 4.1373924476429435e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4994 }, { "completion_length": 610.5, "epoch": 1.3844235033259422, "grad_norm": 0.5330249667167664, "kl": 0.42659932374954224, "learning_rate": 4.1370615982245175e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4995 }, { "completion_length": 593.75, "epoch": 1.3847006651884701, "grad_norm": 0.0, "kl": 0.2015148103237152, "learning_rate": 4.136730698604221e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4996 }, { "completion_length": 608.5, "epoch": 1.3849778270509978, "grad_norm": 0.0, "kl": 0.15675728023052216, "learning_rate": 4.136399748792202e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4997 }, { "completion_length": 651.5, "epoch": 1.3852549889135255, "grad_norm": 0.0, "kl": 0.16384747624397278, "learning_rate": 4.136068748798612e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4998 }, { "completion_length": 702.5, "epoch": 1.3855321507760532, "grad_norm": 0.0, "kl": 0.14515018463134766, "learning_rate": 4.135737698633598e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 4999 }, { "completion_length": 617.5, "epoch": 1.3858093126385809, "grad_norm": 0.0, "kl": 0.13951528072357178, "learning_rate": 4.1354065983073136e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5000 }, { "completion_length": 590.25, "epoch": 1.3860864745011088, "grad_norm": 0.0, "kl": 0.18999917805194855, "learning_rate": 4.135075447829912e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5001 }, { "completion_length": 599.0, "epoch": 1.3863636363636362, "grad_norm": 0.0, "kl": 0.18219606578350067, "learning_rate": 4.134744247211547e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5002 }, { "completion_length": 604.0, "epoch": 1.3866407982261642, "grad_norm": 0.0, "kl": 0.1542975753545761, "learning_rate": 4.1344129964623765e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5003 }, { "completion_length": 539.0, "epoch": 1.3869179600886918, "grad_norm": 0.0, "kl": 0.18782733380794525, "learning_rate": 4.13408169559256e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5004 }, { "completion_length": 662.0, "epoch": 1.3871951219512195, "grad_norm": 0.4180011749267578, "kl": 0.18588165938854218, "learning_rate": 4.1337503446122545e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5005 }, { "completion_length": 621.25, "epoch": 1.3874722838137472, "grad_norm": 0.0, "kl": 0.32454371452331543, "learning_rate": 4.1334189435316215e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5006 }, { "completion_length": 695.0, "epoch": 1.387749445676275, "grad_norm": 0.0, "kl": 0.8492425084114075, "learning_rate": 4.133087492360825e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5007 }, { "completion_length": 626.25, "epoch": 1.3880266075388026, "grad_norm": 0.0, "kl": 0.16227534413337708, "learning_rate": 4.132755991110029e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5008 }, { "completion_length": 647.25, "epoch": 1.3883037694013303, "grad_norm": 0.0, "kl": 0.21703319251537323, "learning_rate": 4.132424439789399e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5009 }, { "completion_length": 594.5, "epoch": 1.3885809312638582, "grad_norm": 0.0, "kl": 0.25810864567756653, "learning_rate": 4.132092838409102e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5010 }, { "completion_length": 949.0, "epoch": 1.3888580931263859, "grad_norm": 0.0, "kl": 0.1317659467458725, "learning_rate": 4.131761186979306e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5011 }, { "completion_length": 689.0, "epoch": 1.3891352549889135, "grad_norm": 0.0, "kl": 0.18364064395427704, "learning_rate": 4.131429485510183e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5012 }, { "completion_length": 609.0, "epoch": 1.3894124168514412, "grad_norm": 0.0, "kl": 0.563903272151947, "learning_rate": 4.131097734011905e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5013 }, { "completion_length": 712.75, "epoch": 1.389689578713969, "grad_norm": 0.5489373207092285, "kl": 0.3073089122772217, "learning_rate": 4.1307659324946435e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5014 }, { "completion_length": 683.5, "epoch": 1.3899667405764966, "grad_norm": 0.0, "kl": 0.16797342896461487, "learning_rate": 4.130434080968576e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5015 }, { "completion_length": 605.75, "epoch": 1.3902439024390243, "grad_norm": 0.5342823266983032, "kl": 0.1563829481601715, "learning_rate": 4.130102179443877e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5016 }, { "completion_length": 605.5, "epoch": 1.3905210643015522, "grad_norm": 0.0, "kl": 0.20628708600997925, "learning_rate": 4.1297702279307235e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5017 }, { "completion_length": 606.5, "epoch": 1.3907982261640799, "grad_norm": 0.0, "kl": 0.20306634902954102, "learning_rate": 4.1294382264392994e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5018 }, { "completion_length": 607.0, "epoch": 1.3910753880266076, "grad_norm": 0.0, "kl": 0.1709171086549759, "learning_rate": 4.129106174979782e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5019 }, { "completion_length": 558.0, "epoch": 1.3913525498891353, "grad_norm": 0.0, "kl": 0.2587142884731293, "learning_rate": 4.128774073562355e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5020 }, { "completion_length": 588.0, "epoch": 1.391629711751663, "grad_norm": 0.0, "kl": 0.3751722276210785, "learning_rate": 4.128441922197203e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5021 }, { "completion_length": 655.25, "epoch": 1.3919068736141906, "grad_norm": 0.0, "kl": 0.15197810530662537, "learning_rate": 4.128109720894512e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5022 }, { "completion_length": 715.75, "epoch": 1.3921840354767183, "grad_norm": 0.0, "kl": 0.2353105992078781, "learning_rate": 4.127777469664468e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5023 }, { "completion_length": 509.5, "epoch": 1.3924611973392462, "grad_norm": 0.0, "kl": 0.2322021871805191, "learning_rate": 4.12744516851726e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5024 }, { "completion_length": 656.5, "epoch": 1.392738359201774, "grad_norm": 0.4884937107563019, "kl": 0.6592103838920593, "learning_rate": 4.127112817463079e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5025 }, { "completion_length": 657.5, "epoch": 1.3930155210643016, "grad_norm": 0.0, "kl": 0.18557871878147125, "learning_rate": 4.126780416512116e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5026 }, { "completion_length": 642.5, "epoch": 1.3932926829268293, "grad_norm": 0.0, "kl": 0.14810222387313843, "learning_rate": 4.126447965674566e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5027 }, { "completion_length": 554.0, "epoch": 1.393569844789357, "grad_norm": 0.0, "kl": 0.40676480531692505, "learning_rate": 4.126115464960621e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5028 }, { "completion_length": 623.75, "epoch": 1.3938470066518847, "grad_norm": 0.0, "kl": 0.1655089110136032, "learning_rate": 4.125782914380482e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5029 }, { "completion_length": 617.25, "epoch": 1.3941241685144123, "grad_norm": 0.4486280083656311, "kl": 0.13981245458126068, "learning_rate": 4.125450313944342e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5030 }, { "completion_length": 662.75, "epoch": 1.3944013303769403, "grad_norm": 0.4261506199836731, "kl": 204.4852752685547, "learning_rate": 4.1251176636624026e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5031 }, { "completion_length": 633.75, "epoch": 1.3946784922394677, "grad_norm": 0.0, "kl": 0.16511572897434235, "learning_rate": 4.124784963544865e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5032 }, { "completion_length": 585.25, "epoch": 1.3949556541019956, "grad_norm": 0.0, "kl": 0.16359390318393707, "learning_rate": 4.1244522136019315e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5033 }, { "completion_length": 554.75, "epoch": 1.3952328159645233, "grad_norm": 0.0, "kl": 0.19662699103355408, "learning_rate": 4.124119413843807e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5034 }, { "completion_length": 671.75, "epoch": 1.395509977827051, "grad_norm": 0.0, "kl": 0.4101179838180542, "learning_rate": 4.123786564280695e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5035 }, { "completion_length": 523.25, "epoch": 1.3957871396895787, "grad_norm": 0.0, "kl": 0.19127710163593292, "learning_rate": 4.123453664922805e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5036 }, { "completion_length": 582.25, "epoch": 1.3960643015521064, "grad_norm": 0.0, "kl": 0.2284262776374817, "learning_rate": 4.123120715780344e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5037 }, { "completion_length": 618.75, "epoch": 1.3963414634146343, "grad_norm": 0.0, "kl": 0.20124000310897827, "learning_rate": 4.122787716863522e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5038 }, { "completion_length": 886.5, "epoch": 1.3966186252771617, "grad_norm": 0.5107705593109131, "kl": 78.9078369140625, "learning_rate": 4.122454668182552e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5039 }, { "completion_length": 638.5, "epoch": 1.3968957871396896, "grad_norm": 0.0, "kl": 0.13194817304611206, "learning_rate": 4.122121569747647e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5040 }, { "completion_length": 649.0, "epoch": 1.3971729490022173, "grad_norm": 0.0, "kl": 0.24139173328876495, "learning_rate": 4.121788421569021e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5041 }, { "completion_length": 535.25, "epoch": 1.397450110864745, "grad_norm": 0.0, "kl": 0.16570232808589935, "learning_rate": 4.12145522365689e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5042 }, { "completion_length": 671.75, "epoch": 1.3977272727272727, "grad_norm": 0.0, "kl": 0.15912817418575287, "learning_rate": 4.121121976021473e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5043 }, { "completion_length": 624.5, "epoch": 1.3980044345898004, "grad_norm": 0.0, "kl": 0.27856263518333435, "learning_rate": 4.120788678672989e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5044 }, { "completion_length": 639.75, "epoch": 1.398281596452328, "grad_norm": 0.0, "kl": 0.2116994857788086, "learning_rate": 4.1204553316216574e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5045 }, { "completion_length": 634.5, "epoch": 1.3985587583148558, "grad_norm": 0.0, "kl": 0.21224766969680786, "learning_rate": 4.120121934877702e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5046 }, { "completion_length": 593.75, "epoch": 1.3988359201773837, "grad_norm": 0.0, "kl": 0.18633274734020233, "learning_rate": 4.119788488451347e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5047 }, { "completion_length": 543.25, "epoch": 1.3991130820399114, "grad_norm": 0.0, "kl": 6.724603176116943, "learning_rate": 4.1194549923528175e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5048 }, { "completion_length": 705.0, "epoch": 1.399390243902439, "grad_norm": 0.0, "kl": 0.18325692415237427, "learning_rate": 4.119121446592339e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5049 }, { "completion_length": 601.25, "epoch": 1.3996674057649667, "grad_norm": 0.0, "kl": 0.19219015538692474, "learning_rate": 4.118787851180142e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5050 }, { "completion_length": 560.5, "epoch": 1.3999445676274944, "grad_norm": 0.0, "kl": 0.16380318999290466, "learning_rate": 4.118454206126456e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5051 }, { "completion_length": 637.75, "epoch": 1.400221729490022, "grad_norm": 0.0, "kl": 0.2050427347421646, "learning_rate": 4.118120511441512e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5052 }, { "completion_length": 579.75, "epoch": 1.4004988913525498, "grad_norm": 0.6744911074638367, "kl": 0.1772429198026657, "learning_rate": 4.117786767135541e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5053 }, { "completion_length": 631.0, "epoch": 1.4007760532150777, "grad_norm": 0.0, "kl": 0.6023733019828796, "learning_rate": 4.117452973218782e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5054 }, { "completion_length": 538.25, "epoch": 1.4010532150776054, "grad_norm": 0.0, "kl": 0.22979699075222015, "learning_rate": 4.117119129701468e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5055 }, { "completion_length": 636.0, "epoch": 1.401330376940133, "grad_norm": 0.0, "kl": 0.2750307321548462, "learning_rate": 4.116785236593837e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5056 }, { "completion_length": 762.0, "epoch": 1.4016075388026608, "grad_norm": 0.0, "kl": 0.13720935583114624, "learning_rate": 4.1164512939061284e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5057 }, { "completion_length": 709.5, "epoch": 1.4018847006651884, "grad_norm": 0.0, "kl": 0.14736011624336243, "learning_rate": 4.1161173016485835e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5058 }, { "completion_length": 646.25, "epoch": 1.4021618625277161, "grad_norm": 0.0, "kl": 0.18656562268733978, "learning_rate": 4.115783259831444e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5059 }, { "completion_length": 579.25, "epoch": 1.4024390243902438, "grad_norm": 0.0, "kl": 0.22418278455734253, "learning_rate": 4.1154491684649526e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5060 }, { "completion_length": 631.75, "epoch": 1.4027161862527717, "grad_norm": 0.5088966488838196, "kl": 0.4218330383300781, "learning_rate": 4.115115027559355e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5061 }, { "completion_length": 599.25, "epoch": 1.4029933481152994, "grad_norm": 0.0, "kl": 0.23053477704524994, "learning_rate": 4.1147808371249e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5062 }, { "completion_length": 593.75, "epoch": 1.403270509977827, "grad_norm": 0.39416444301605225, "kl": 0.1842641979455948, "learning_rate": 4.114446597171833e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5063 }, { "completion_length": 753.5, "epoch": 1.4035476718403548, "grad_norm": 0.0, "kl": 0.13092023134231567, "learning_rate": 4.114112307710405e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5064 }, { "completion_length": 629.0, "epoch": 1.4038248337028825, "grad_norm": 0.0, "kl": 0.2090948522090912, "learning_rate": 4.113777968750866e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5065 }, { "completion_length": 635.25, "epoch": 1.4041019955654102, "grad_norm": 0.0, "kl": 0.3584511876106262, "learning_rate": 4.113443580303471e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5066 }, { "completion_length": 644.5, "epoch": 1.4043791574279378, "grad_norm": 0.0, "kl": 0.5539840459823608, "learning_rate": 4.113109142378473e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5067 }, { "completion_length": 686.0, "epoch": 1.4046563192904657, "grad_norm": 0.39467868208885193, "kl": 0.18595042824745178, "learning_rate": 4.112774654986128e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5068 }, { "completion_length": 767.75, "epoch": 1.4049334811529932, "grad_norm": 0.0, "kl": 0.16462372243404388, "learning_rate": 4.112440118136692e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5069 }, { "completion_length": 521.75, "epoch": 1.4052106430155211, "grad_norm": 0.0, "kl": 0.14944851398468018, "learning_rate": 4.112105531840427e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5070 }, { "completion_length": 646.25, "epoch": 1.4054878048780488, "grad_norm": 0.6796884536743164, "kl": 0.1994069218635559, "learning_rate": 4.111770896107591e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5071 }, { "completion_length": 576.75, "epoch": 1.4057649667405765, "grad_norm": 0.0, "kl": 0.16111892461776733, "learning_rate": 4.1114362109484465e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5072 }, { "completion_length": 654.0, "epoch": 1.4060421286031042, "grad_norm": 0.4592016339302063, "kl": 0.20873406529426575, "learning_rate": 4.111101476373256e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5073 }, { "completion_length": 701.0, "epoch": 1.4063192904656319, "grad_norm": 0.0, "kl": 0.1371215283870697, "learning_rate": 4.110766692392286e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5074 }, { "completion_length": 563.0, "epoch": 1.4065964523281598, "grad_norm": 0.0, "kl": 0.2768014669418335, "learning_rate": 4.110431859015802e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5075 }, { "completion_length": 612.0, "epoch": 1.4068736141906872, "grad_norm": 0.0, "kl": 0.15217092633247375, "learning_rate": 4.110096976254072e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5076 }, { "completion_length": 778.75, "epoch": 1.4071507760532151, "grad_norm": 0.0, "kl": 0.132700577378273, "learning_rate": 4.109762044117365e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5077 }, { "completion_length": 595.25, "epoch": 1.4074279379157428, "grad_norm": 0.0, "kl": 0.15417027473449707, "learning_rate": 4.109427062615954e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5078 }, { "completion_length": 699.25, "epoch": 1.4077050997782705, "grad_norm": 0.0, "kl": 0.19842217862606049, "learning_rate": 4.109092031760109e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5079 }, { "completion_length": 533.5, "epoch": 1.4079822616407982, "grad_norm": 0.0, "kl": 0.21953007578849792, "learning_rate": 4.108756951560105e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5080 }, { "completion_length": 592.0, "epoch": 1.408259423503326, "grad_norm": 0.0, "kl": 0.16217371821403503, "learning_rate": 4.1084218220262175e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5081 }, { "completion_length": 663.5, "epoch": 1.4085365853658536, "grad_norm": 0.0, "kl": 0.1485685259103775, "learning_rate": 4.108086643168724e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5082 }, { "completion_length": 763.0, "epoch": 1.4088137472283813, "grad_norm": 0.0, "kl": 0.6094321012496948, "learning_rate": 4.107751414997903e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5083 }, { "completion_length": 635.0, "epoch": 1.4090909090909092, "grad_norm": 0.0, "kl": 0.25176095962524414, "learning_rate": 4.107416137524033e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5084 }, { "completion_length": 611.25, "epoch": 1.4093680709534369, "grad_norm": 0.0, "kl": 0.24892467260360718, "learning_rate": 4.107080810757398e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5085 }, { "completion_length": 658.75, "epoch": 1.4096452328159645, "grad_norm": 0.44212275743484497, "kl": 725.1699829101562, "learning_rate": 4.106745434708279e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5086 }, { "completion_length": 596.25, "epoch": 1.4099223946784922, "grad_norm": 0.4379618167877197, "kl": 20.89103889465332, "learning_rate": 4.1064100093869615e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5087 }, { "completion_length": 705.75, "epoch": 1.41019955654102, "grad_norm": 0.0, "kl": 0.16957160830497742, "learning_rate": 4.106074534803732e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5088 }, { "completion_length": 769.25, "epoch": 1.4104767184035476, "grad_norm": 0.0, "kl": 0.150491863489151, "learning_rate": 4.105739010968878e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5089 }, { "completion_length": 628.25, "epoch": 1.4107538802660753, "grad_norm": 0.0, "kl": 0.37166017293930054, "learning_rate": 4.105403437892687e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5090 }, { "completion_length": 660.0, "epoch": 1.4110310421286032, "grad_norm": 0.5063303709030151, "kl": 0.2528875768184662, "learning_rate": 4.105067815585452e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5091 }, { "completion_length": 654.5, "epoch": 1.4113082039911309, "grad_norm": 0.6684159636497498, "kl": 0.17598311603069305, "learning_rate": 4.104732144057463e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5092 }, { "completion_length": 726.75, "epoch": 1.4115853658536586, "grad_norm": 0.0, "kl": 0.2288568913936615, "learning_rate": 4.104396423319015e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5093 }, { "completion_length": 580.25, "epoch": 1.4118625277161863, "grad_norm": 0.0, "kl": 0.3707232177257538, "learning_rate": 4.104060653380403e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5094 }, { "completion_length": 625.0, "epoch": 1.412139689578714, "grad_norm": 0.0, "kl": 0.14814715087413788, "learning_rate": 4.103724834251923e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5095 }, { "completion_length": 732.5, "epoch": 1.4124168514412416, "grad_norm": 0.0, "kl": 0.22576774656772614, "learning_rate": 4.103388965943874e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5096 }, { "completion_length": 643.5, "epoch": 1.4126940133037693, "grad_norm": 0.0, "kl": 0.3162773549556732, "learning_rate": 4.1030530484665564e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5097 }, { "completion_length": 594.5, "epoch": 1.4129711751662972, "grad_norm": 0.3986743092536926, "kl": 0.17447160184383392, "learning_rate": 4.1027170818302695e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5098 }, { "completion_length": 534.75, "epoch": 1.413248337028825, "grad_norm": 1.306956171989441, "kl": 0.19159890711307526, "learning_rate": 4.102381066045316e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5099 }, { "completion_length": 666.5, "epoch": 1.4135254988913526, "grad_norm": 0.0, "kl": 31.687744140625, "learning_rate": 4.102045001122002e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5100 }, { "completion_length": 528.25, "epoch": 1.4138026607538803, "grad_norm": 0.43967658281326294, "kl": 0.26435261964797974, "learning_rate": 4.101708887070632e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5101 }, { "completion_length": 554.0, "epoch": 1.414079822616408, "grad_norm": 0.0, "kl": 0.17219066619873047, "learning_rate": 4.101372723901513e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5102 }, { "completion_length": 630.25, "epoch": 1.4143569844789357, "grad_norm": 0.0, "kl": 0.17423106729984283, "learning_rate": 4.101036511624955e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5103 }, { "completion_length": 621.5, "epoch": 1.4146341463414633, "grad_norm": 0.5425935387611389, "kl": 16.355743408203125, "learning_rate": 4.100700250251266e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5104 }, { "completion_length": 597.75, "epoch": 1.4149113082039912, "grad_norm": 0.0, "kl": 0.15784481167793274, "learning_rate": 4.10036393979076e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5105 }, { "completion_length": 628.0, "epoch": 1.4151884700665187, "grad_norm": 0.0, "kl": 0.16996000707149506, "learning_rate": 4.1000275802537484e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5106 }, { "completion_length": 568.75, "epoch": 1.4154656319290466, "grad_norm": 0.0, "kl": 0.18402615189552307, "learning_rate": 4.099691171650547e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5107 }, { "completion_length": 648.75, "epoch": 1.4157427937915743, "grad_norm": 0.0, "kl": 0.17495210468769073, "learning_rate": 4.099354713991473e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5108 }, { "completion_length": 576.25, "epoch": 1.416019955654102, "grad_norm": 0.0, "kl": 0.21233730018138885, "learning_rate": 4.099018207286842e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5109 }, { "completion_length": 583.25, "epoch": 1.4162971175166297, "grad_norm": 0.0, "kl": 0.31894445419311523, "learning_rate": 4.098681651546975e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5110 }, { "completion_length": 570.5, "epoch": 1.4165742793791574, "grad_norm": 0.0, "kl": 0.17096242308616638, "learning_rate": 4.098345046782192e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5111 }, { "completion_length": 609.25, "epoch": 1.4168514412416853, "grad_norm": 0.0, "kl": 0.1625632643699646, "learning_rate": 4.098008393002816e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5112 }, { "completion_length": 584.0, "epoch": 1.4171286031042127, "grad_norm": 0.0, "kl": 0.2709343433380127, "learning_rate": 4.097671690219169e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5113 }, { "completion_length": 572.5, "epoch": 1.4174057649667406, "grad_norm": 0.0, "kl": 0.5437230467796326, "learning_rate": 4.097334938441577e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5114 }, { "completion_length": 603.75, "epoch": 1.4176829268292683, "grad_norm": 0.0, "kl": 0.156784787774086, "learning_rate": 4.0969981376803684e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5115 }, { "completion_length": 639.0, "epoch": 1.417960088691796, "grad_norm": 0.0, "kl": 0.14398904144763947, "learning_rate": 4.09666128794587e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5116 }, { "completion_length": 668.0, "epoch": 1.4182372505543237, "grad_norm": 0.0, "kl": 0.1910693198442459, "learning_rate": 4.096324389248411e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5117 }, { "completion_length": 582.75, "epoch": 1.4185144124168514, "grad_norm": 0.0, "kl": 0.300670862197876, "learning_rate": 4.095987441598325e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5118 }, { "completion_length": 704.0, "epoch": 1.418791574279379, "grad_norm": 0.0, "kl": 0.19348444044589996, "learning_rate": 4.095650445005943e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5119 }, { "completion_length": 604.5, "epoch": 1.4190687361419068, "grad_norm": 0.0, "kl": 0.2886962592601776, "learning_rate": 4.095313399481599e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5120 }, { "completion_length": 609.0, "epoch": 1.4193458980044347, "grad_norm": 0.4231072664260864, "kl": 4899.283203125, "learning_rate": 4.094976305035629e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5121 }, { "completion_length": 653.25, "epoch": 1.4196230598669624, "grad_norm": 0.0, "kl": 0.26792243123054504, "learning_rate": 4.0946391616783715e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5122 }, { "completion_length": 561.25, "epoch": 1.41990022172949, "grad_norm": 0.8615424036979675, "kl": 0.6465981006622314, "learning_rate": 4.0943019694201645e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5123 }, { "completion_length": 562.5, "epoch": 1.4201773835920177, "grad_norm": 0.0, "kl": 0.1875762939453125, "learning_rate": 4.093964728271347e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5124 }, { "completion_length": 771.75, "epoch": 1.4204545454545454, "grad_norm": 0.0, "kl": 0.1511896550655365, "learning_rate": 4.093627438242264e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5125 }, { "completion_length": 598.0, "epoch": 1.420731707317073, "grad_norm": 0.0, "kl": 0.18343719840049744, "learning_rate": 4.093290099343255e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5126 }, { "completion_length": 589.0, "epoch": 1.4210088691796008, "grad_norm": 0.0, "kl": 0.17490091919898987, "learning_rate": 4.0929527115846676e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5127 }, { "completion_length": 637.5, "epoch": 1.4212860310421287, "grad_norm": 0.0, "kl": 0.1779276430606842, "learning_rate": 4.092615274976846e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5128 }, { "completion_length": 660.0, "epoch": 1.4215631929046564, "grad_norm": NaN, "kl": 0.3726280629634857, "learning_rate": 4.0922777895301405e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5129 }, { "completion_length": 654.5, "epoch": 1.421840354767184, "grad_norm": 0.0, "kl": 0.1560097187757492, "learning_rate": 4.0922777895301405e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5130 }, { "completion_length": 651.25, "epoch": 1.4221175166297118, "grad_norm": 0.0, "kl": 0.13558806478977203, "learning_rate": 4.091940255254897e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5131 }, { "completion_length": 644.0, "epoch": 1.4223946784922394, "grad_norm": 0.0, "kl": 0.15728938579559326, "learning_rate": 4.09160267216147e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5132 }, { "completion_length": 602.0, "epoch": 1.4226718403547671, "grad_norm": 0.0, "kl": 0.16502997279167175, "learning_rate": 4.09126504026021e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5133 }, { "completion_length": 653.5, "epoch": 1.4229490022172948, "grad_norm": 0.0, "kl": 5.6298441886901855, "learning_rate": 4.090927359561469e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5134 }, { "completion_length": 633.5, "epoch": 1.4232261640798227, "grad_norm": 0.0, "kl": 0.24014103412628174, "learning_rate": 4.0905896300756055e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5135 }, { "completion_length": 652.5, "epoch": 1.4235033259423504, "grad_norm": 0.6018653512001038, "kl": 0.5287405252456665, "learning_rate": 4.090251851812974e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5136 }, { "completion_length": 607.25, "epoch": 1.423780487804878, "grad_norm": 0.0, "kl": 0.17905156314373016, "learning_rate": 4.089914024783933e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5137 }, { "completion_length": 671.25, "epoch": 1.4240576496674058, "grad_norm": 0.4318655729293823, "kl": 21.521411895751953, "learning_rate": 4.089576148998844e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5138 }, { "completion_length": 589.75, "epoch": 1.4243348115299335, "grad_norm": 0.0, "kl": 0.1697012186050415, "learning_rate": 4.0892382244680665e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5139 }, { "completion_length": 668.0, "epoch": 1.4246119733924612, "grad_norm": 0.0, "kl": 0.18624748289585114, "learning_rate": 4.088900251201964e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5140 }, { "completion_length": 669.0, "epoch": 1.4248891352549888, "grad_norm": 0.0, "kl": 0.17452026903629303, "learning_rate": 4.0885622292108996e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5141 }, { "completion_length": 632.75, "epoch": 1.4251662971175167, "grad_norm": 0.0, "kl": 0.3015785217285156, "learning_rate": 4.088224158505241e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5142 }, { "completion_length": 558.75, "epoch": 1.4254434589800442, "grad_norm": 0.0, "kl": 0.418633371591568, "learning_rate": 4.087886039095353e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5143 }, { "completion_length": 613.75, "epoch": 1.4257206208425721, "grad_norm": 0.0, "kl": 0.16786369681358337, "learning_rate": 4.087547870991606e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5144 }, { "completion_length": 580.0, "epoch": 1.4259977827050998, "grad_norm": 0.0, "kl": 0.20119154453277588, "learning_rate": 4.0872096542043705e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5145 }, { "completion_length": 587.25, "epoch": 1.4262749445676275, "grad_norm": 0.0, "kl": 0.20177052915096283, "learning_rate": 4.086871388744017e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5146 }, { "completion_length": 554.0, "epoch": 1.4265521064301552, "grad_norm": 0.0, "kl": 0.18128077685832977, "learning_rate": 4.086533074620919e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5147 }, { "completion_length": 657.75, "epoch": 1.4268292682926829, "grad_norm": 0.0, "kl": 0.18596014380455017, "learning_rate": 4.086194711845452e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5148 }, { "completion_length": 769.0, "epoch": 1.4271064301552108, "grad_norm": 0.0, "kl": 0.20332826673984528, "learning_rate": 4.085856300427992e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5149 }, { "completion_length": 552.75, "epoch": 1.4273835920177382, "grad_norm": 0.0, "kl": 0.1704510897397995, "learning_rate": 4.0855178403789155e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5150 }, { "completion_length": 703.0, "epoch": 1.4276607538802661, "grad_norm": 0.5358071327209473, "kl": 1.1787495613098145, "learning_rate": 4.085179331708602e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5151 }, { "completion_length": 685.5, "epoch": 1.4279379157427938, "grad_norm": 0.0, "kl": 0.1961745172739029, "learning_rate": 4.084840774427433e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5152 }, { "completion_length": 629.25, "epoch": 1.4282150776053215, "grad_norm": 0.6198126673698425, "kl": 1967.5748291015625, "learning_rate": 4.084502168545791e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5153 }, { "completion_length": 623.75, "epoch": 1.4284922394678492, "grad_norm": 0.440902441740036, "kl": 1137.0078125, "learning_rate": 4.084163514074058e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5154 }, { "completion_length": 557.75, "epoch": 1.4287694013303769, "grad_norm": 0.0, "kl": 0.19302402436733246, "learning_rate": 4.08382481102262e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5155 }, { "completion_length": 602.0, "epoch": 1.4290465631929046, "grad_norm": 0.0, "kl": 0.17384333908557892, "learning_rate": 4.083486059401864e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5156 }, { "completion_length": 770.5, "epoch": 1.4293237250554323, "grad_norm": 0.0, "kl": 0.17797140777111053, "learning_rate": 4.083147259222178e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5157 }, { "completion_length": 656.5, "epoch": 1.4296008869179602, "grad_norm": 0.0, "kl": 0.20187543332576752, "learning_rate": 4.0828084104939506e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5158 }, { "completion_length": 645.75, "epoch": 1.4298780487804879, "grad_norm": 0.0, "kl": 0.16851480305194855, "learning_rate": 4.082469513227573e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5159 }, { "completion_length": 682.75, "epoch": 1.4301552106430155, "grad_norm": 0.5221818685531616, "kl": 0.5551087260246277, "learning_rate": 4.082130567433439e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5160 }, { "completion_length": 653.0, "epoch": 1.4304323725055432, "grad_norm": 0.0, "kl": 0.2529441714286804, "learning_rate": 4.0817915731219425e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5161 }, { "completion_length": 674.25, "epoch": 1.430709534368071, "grad_norm": 0.422962486743927, "kl": 1050.74072265625, "learning_rate": 4.081452530303478e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5162 }, { "completion_length": 619.25, "epoch": 1.4309866962305986, "grad_norm": 0.0, "kl": 0.14969177544116974, "learning_rate": 4.081113438988443e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5163 }, { "completion_length": 654.0, "epoch": 1.4312638580931263, "grad_norm": 0.0, "kl": 0.17960573732852936, "learning_rate": 4.080774299187237e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5164 }, { "completion_length": 646.0, "epoch": 1.4315410199556542, "grad_norm": 0.0, "kl": 0.1754632145166397, "learning_rate": 4.080435110910258e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5165 }, { "completion_length": 620.25, "epoch": 1.4318181818181819, "grad_norm": 0.0, "kl": 0.1745135635137558, "learning_rate": 4.08009587416791e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5166 }, { "completion_length": 626.0, "epoch": 1.4320953436807096, "grad_norm": 0.0, "kl": 0.25023967027664185, "learning_rate": 4.079756588970593e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5167 }, { "completion_length": 716.75, "epoch": 1.4323725055432373, "grad_norm": 0.0, "kl": 118.94284057617188, "learning_rate": 4.079417255328714e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5168 }, { "completion_length": 581.5, "epoch": 1.432649667405765, "grad_norm": 0.0, "kl": 0.3240221440792084, "learning_rate": 4.079077873252677e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5169 }, { "completion_length": 608.5, "epoch": 1.4329268292682926, "grad_norm": 0.0, "kl": 0.3449830412864685, "learning_rate": 4.078738442752891e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5170 }, { "completion_length": 648.0, "epoch": 1.4332039911308203, "grad_norm": 0.0, "kl": 0.1901497095823288, "learning_rate": 4.078398963839765e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5171 }, { "completion_length": 674.25, "epoch": 1.4334811529933482, "grad_norm": 0.4075292944908142, "kl": 32.07170104980469, "learning_rate": 4.0780594365237075e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5172 }, { "completion_length": 542.75, "epoch": 1.433758314855876, "grad_norm": 0.0, "kl": 0.2930629849433899, "learning_rate": 4.077719860815132e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5173 }, { "completion_length": 623.5, "epoch": 1.4340354767184036, "grad_norm": 0.5590749979019165, "kl": 0.2341911643743515, "learning_rate": 4.077380236724452e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5174 }, { "completion_length": 657.0, "epoch": 1.4343126385809313, "grad_norm": 0.0, "kl": 0.2189110815525055, "learning_rate": 4.077040564262082e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5175 }, { "completion_length": 592.5, "epoch": 1.434589800443459, "grad_norm": 0.0, "kl": 0.20336800813674927, "learning_rate": 4.076700843438438e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5176 }, { "completion_length": 669.25, "epoch": 1.4348669623059866, "grad_norm": 0.4248564541339874, "kl": 28.445886611938477, "learning_rate": 4.076361074263938e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5177 }, { "completion_length": 590.75, "epoch": 1.4351441241685143, "grad_norm": 0.0, "kl": 0.22268620133399963, "learning_rate": 4.076021256749001e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5178 }, { "completion_length": 530.25, "epoch": 1.4354212860310422, "grad_norm": 0.0, "kl": 0.15356455743312836, "learning_rate": 4.075681390904048e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5179 }, { "completion_length": 600.25, "epoch": 1.4356984478935697, "grad_norm": 0.0, "kl": 0.23303534090518951, "learning_rate": 4.075341476739503e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5180 }, { "completion_length": 577.75, "epoch": 1.4359756097560976, "grad_norm": 0.7573637962341309, "kl": 28.85895538330078, "learning_rate": 4.075001514265786e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5181 }, { "completion_length": 572.5, "epoch": 1.4362527716186253, "grad_norm": 0.0, "kl": 0.25389912724494934, "learning_rate": 4.0746615034933255e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5182 }, { "completion_length": 582.0, "epoch": 1.436529933481153, "grad_norm": 0.0, "kl": 0.1946174055337906, "learning_rate": 4.074321444432547e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5183 }, { "completion_length": 576.25, "epoch": 1.4368070953436807, "grad_norm": 0.0, "kl": 0.21670690178871155, "learning_rate": 4.073981337093879e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5184 }, { "completion_length": 674.0, "epoch": 1.4370842572062084, "grad_norm": 0.0, "kl": 1.6941980123519897, "learning_rate": 4.073641181487751e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5185 }, { "completion_length": 568.0, "epoch": 1.4373614190687363, "grad_norm": 0.0, "kl": 0.23568607866764069, "learning_rate": 4.073300977624594e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5186 }, { "completion_length": 610.0, "epoch": 1.4376385809312637, "grad_norm": 0.0, "kl": 0.23763048648834229, "learning_rate": 4.0729607255148405e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5187 }, { "completion_length": 592.0, "epoch": 1.4379157427937916, "grad_norm": 0.0, "kl": 0.16849854588508606, "learning_rate": 4.072620425168925e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5188 }, { "completion_length": 597.0, "epoch": 1.4381929046563193, "grad_norm": 0.0, "kl": 0.22880001366138458, "learning_rate": 4.072280076597284e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5189 }, { "completion_length": 638.0, "epoch": 1.438470066518847, "grad_norm": 0.506734311580658, "kl": 0.2882823050022125, "learning_rate": 4.071939679810353e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5190 }, { "completion_length": 577.0, "epoch": 1.4387472283813747, "grad_norm": 0.0, "kl": 0.2379976212978363, "learning_rate": 4.07159923481857e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5191 }, { "completion_length": 678.75, "epoch": 1.4390243902439024, "grad_norm": 0.0, "kl": 0.1378423571586609, "learning_rate": 4.0712587416323775e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5192 }, { "completion_length": 577.75, "epoch": 1.4393015521064303, "grad_norm": 0.0, "kl": 0.33832767605781555, "learning_rate": 4.070918200262215e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5193 }, { "completion_length": 635.75, "epoch": 1.4395787139689578, "grad_norm": 0.0, "kl": 0.1869177222251892, "learning_rate": 4.0705776107185265e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5194 }, { "completion_length": 467.25, "epoch": 1.4398558758314857, "grad_norm": 0.0, "kl": 2.9028520584106445, "learning_rate": 4.070236973011757e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5195 }, { "completion_length": 672.75, "epoch": 1.4401330376940134, "grad_norm": 0.0, "kl": 0.14651094377040863, "learning_rate": 4.0698962871523494e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5196 }, { "completion_length": 535.0, "epoch": 1.440410199556541, "grad_norm": 0.0, "kl": 0.19909946620464325, "learning_rate": 4.069555553150755e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5197 }, { "completion_length": 628.0, "epoch": 1.4406873614190687, "grad_norm": 0.0, "kl": 0.2182197868824005, "learning_rate": 4.06921477101742e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5198 }, { "completion_length": 552.25, "epoch": 1.4409645232815964, "grad_norm": 0.0, "kl": 0.200071319937706, "learning_rate": 4.068873940762796e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5199 }, { "completion_length": 631.0, "epoch": 1.441241685144124, "grad_norm": 0.0, "kl": 0.20999406278133392, "learning_rate": 4.0685330623973355e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5200 }, { "completion_length": 675.75, "epoch": 1.4415188470066518, "grad_norm": 0.45514896512031555, "kl": 0.24776025116443634, "learning_rate": 4.06819213593149e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5201 }, { "completion_length": 573.25, "epoch": 1.4417960088691797, "grad_norm": 1.0261480808258057, "kl": 0.5292895436286926, "learning_rate": 4.067851161375715e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5202 }, { "completion_length": 570.75, "epoch": 1.4420731707317074, "grad_norm": 0.0, "kl": 0.18383951485157013, "learning_rate": 4.067510138740467e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5203 }, { "completion_length": 717.25, "epoch": 1.442350332594235, "grad_norm": 0.0, "kl": 0.16426768898963928, "learning_rate": 4.0671690680362045e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5204 }, { "completion_length": 626.75, "epoch": 1.4426274944567627, "grad_norm": 0.0, "kl": 0.20599797368049622, "learning_rate": 4.066827949273385e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5205 }, { "completion_length": 532.0, "epoch": 1.4429046563192904, "grad_norm": 0.0, "kl": 0.2126603126525879, "learning_rate": 4.066486782462471e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5206 }, { "completion_length": 597.5, "epoch": 1.4431818181818181, "grad_norm": 0.0, "kl": 0.1987384706735611, "learning_rate": 4.066145567613922e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5207 }, { "completion_length": 598.0, "epoch": 1.4434589800443458, "grad_norm": 0.0, "kl": 0.26340049505233765, "learning_rate": 4.065804304738206e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5208 }, { "completion_length": 630.0, "epoch": 1.4437361419068737, "grad_norm": 0.0, "kl": 0.229638010263443, "learning_rate": 4.065462993845785e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5209 }, { "completion_length": 531.25, "epoch": 1.4440133037694014, "grad_norm": 0.0, "kl": 0.3007006049156189, "learning_rate": 4.065121634947126e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5210 }, { "completion_length": 569.0, "epoch": 1.444290465631929, "grad_norm": 0.0, "kl": 0.14279207587242126, "learning_rate": 4.064780228052696e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5211 }, { "completion_length": 603.5, "epoch": 1.4445676274944568, "grad_norm": 0.0, "kl": 0.21041437983512878, "learning_rate": 4.064438773172966e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5212 }, { "completion_length": 617.75, "epoch": 1.4448447893569845, "grad_norm": 0.0, "kl": 0.20257468521595, "learning_rate": 4.0640972703184075e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5213 }, { "completion_length": 643.75, "epoch": 1.4451219512195121, "grad_norm": 0.0, "kl": 0.2060060054063797, "learning_rate": 4.063755719499493e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5214 }, { "completion_length": 684.5, "epoch": 1.4453991130820398, "grad_norm": 0.0, "kl": 0.2954198718070984, "learning_rate": 4.0634141207266945e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5215 }, { "completion_length": 711.75, "epoch": 1.4456762749445677, "grad_norm": 0.38711100816726685, "kl": 0.3884144723415375, "learning_rate": 4.063072474010488e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5216 }, { "completion_length": 599.75, "epoch": 1.4459534368070954, "grad_norm": 0.0, "kl": 0.34331050515174866, "learning_rate": 4.062730779361352e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5217 }, { "completion_length": 593.25, "epoch": 1.4462305986696231, "grad_norm": 0.0, "kl": 0.7283703684806824, "learning_rate": 4.062389036789763e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5218 }, { "completion_length": 622.5, "epoch": 1.4465077605321508, "grad_norm": 0.0, "kl": 0.18963493406772614, "learning_rate": 4.062047246306202e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5219 }, { "completion_length": 671.25, "epoch": 1.4467849223946785, "grad_norm": 0.0, "kl": 0.21875527501106262, "learning_rate": 4.06170540792115e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5220 }, { "completion_length": 588.25, "epoch": 1.4470620842572062, "grad_norm": 0.0, "kl": 0.17191259562969208, "learning_rate": 4.06136352164509e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5221 }, { "completion_length": 667.0, "epoch": 1.4473392461197339, "grad_norm": 0.0, "kl": 0.1652405709028244, "learning_rate": 4.061021587488505e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5222 }, { "completion_length": 534.5, "epoch": 1.4476164079822618, "grad_norm": 0.0, "kl": 0.19779078662395477, "learning_rate": 4.060679605461882e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5223 }, { "completion_length": 594.5, "epoch": 1.4478935698447892, "grad_norm": 0.0, "kl": 0.20895449817180634, "learning_rate": 4.060337575575708e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5224 }, { "completion_length": 711.75, "epoch": 1.4481707317073171, "grad_norm": 0.0, "kl": 0.23874130845069885, "learning_rate": 4.059995497840471e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5225 }, { "completion_length": 505.0, "epoch": 1.4484478935698448, "grad_norm": 0.0, "kl": 0.2218710482120514, "learning_rate": 4.059653372266662e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5226 }, { "completion_length": 561.0, "epoch": 1.4487250554323725, "grad_norm": 0.0, "kl": 0.2833687663078308, "learning_rate": 4.059311198864772e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5227 }, { "completion_length": 629.75, "epoch": 1.4490022172949002, "grad_norm": 0.0, "kl": 0.23407964408397675, "learning_rate": 4.0589689776452934e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5228 }, { "completion_length": 768.5, "epoch": 1.4492793791574279, "grad_norm": 0.0, "kl": 0.18475580215454102, "learning_rate": 4.058626708618722e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5229 }, { "completion_length": 636.5, "epoch": 1.4495565410199558, "grad_norm": 0.0, "kl": 0.17299064993858337, "learning_rate": 4.058284391795554e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5230 }, { "completion_length": 595.5, "epoch": 1.4498337028824833, "grad_norm": 0.0, "kl": 0.22767405211925507, "learning_rate": 4.057942027186284e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5231 }, { "completion_length": 678.25, "epoch": 1.4501108647450112, "grad_norm": 0.0, "kl": 0.18663042783737183, "learning_rate": 4.0575996148014156e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5232 }, { "completion_length": 569.25, "epoch": 1.4503880266075388, "grad_norm": 0.0, "kl": 0.17179055511951447, "learning_rate": 4.057257154651444e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5233 }, { "completion_length": 613.75, "epoch": 1.4506651884700665, "grad_norm": 0.0, "kl": 0.20744450390338898, "learning_rate": 4.056914646746875e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5234 }, { "completion_length": 559.75, "epoch": 1.4509423503325942, "grad_norm": 0.5427007079124451, "kl": 12.176844596862793, "learning_rate": 4.056572091098208e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5235 }, { "completion_length": 634.25, "epoch": 1.451219512195122, "grad_norm": 0.0, "kl": 0.13161742687225342, "learning_rate": 4.056229487715953e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5236 }, { "completion_length": 608.0, "epoch": 1.4514966740576496, "grad_norm": 0.0, "kl": 0.15318457782268524, "learning_rate": 4.055886836610612e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5237 }, { "completion_length": 663.0, "epoch": 1.4517738359201773, "grad_norm": 0.0, "kl": 0.14292089641094208, "learning_rate": 4.055544137792695e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5238 }, { "completion_length": 550.75, "epoch": 1.4520509977827052, "grad_norm": 0.0, "kl": 0.19284674525260925, "learning_rate": 4.055201391272709e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5239 }, { "completion_length": 681.0, "epoch": 1.4523281596452329, "grad_norm": 0.0, "kl": 0.17457637190818787, "learning_rate": 4.054858597061166e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5240 }, { "completion_length": 649.0, "epoch": 1.4526053215077606, "grad_norm": 0.0, "kl": 0.21141143143177032, "learning_rate": 4.054515755168579e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5241 }, { "completion_length": 669.75, "epoch": 1.4528824833702882, "grad_norm": 0.0, "kl": 0.2443331629037857, "learning_rate": 4.054172865605459e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5242 }, { "completion_length": 604.25, "epoch": 1.453159645232816, "grad_norm": 0.0, "kl": 0.1633724719285965, "learning_rate": 4.053829928382324e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5243 }, { "completion_length": 761.25, "epoch": 1.4534368070953436, "grad_norm": 0.0, "kl": 0.14460846781730652, "learning_rate": 4.053486943509688e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5244 }, { "completion_length": 628.25, "epoch": 1.4537139689578713, "grad_norm": 0.5902760028839111, "kl": 0.2504439353942871, "learning_rate": 4.0531439109980695e-06, "loss": 0.0, "reward": 1.6875, "reward_std": 0.125, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.6875, "step": 5245 }, { "completion_length": 713.25, "epoch": 1.4539911308203992, "grad_norm": 0.0, "kl": 0.16483290493488312, "learning_rate": 4.052800830857988e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5246 }, { "completion_length": 523.5, "epoch": 1.454268292682927, "grad_norm": 0.0, "kl": 0.18034732341766357, "learning_rate": 4.052457703099965e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5247 }, { "completion_length": 483.75, "epoch": 1.4545454545454546, "grad_norm": 0.5059813857078552, "kl": 985.7698974609375, "learning_rate": 4.052114527734522e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5248 }, { "completion_length": 676.5, "epoch": 1.4548226164079823, "grad_norm": 0.0, "kl": 0.21351569890975952, "learning_rate": 4.051771304772183e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5249 }, { "completion_length": 582.75, "epoch": 1.45509977827051, "grad_norm": 0.5306254625320435, "kl": 0.7817928194999695, "learning_rate": 4.051428034223473e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5250 }, { "completion_length": 645.25, "epoch": 1.4553769401330376, "grad_norm": 0.0, "kl": 0.18015867471694946, "learning_rate": 4.051084716098921e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5251 }, { "completion_length": 665.0, "epoch": 1.4556541019955653, "grad_norm": 0.4800225496292114, "kl": 206.2001190185547, "learning_rate": 4.050741350409051e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5252 }, { "completion_length": 669.0, "epoch": 1.4559312638580932, "grad_norm": 0.0, "kl": 0.1463271677494049, "learning_rate": 4.050397937164395e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5253 }, { "completion_length": 578.0, "epoch": 1.456208425720621, "grad_norm": 0.0, "kl": 0.21275624632835388, "learning_rate": 4.050054476375485e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5254 }, { "completion_length": 581.5, "epoch": 1.4564855875831486, "grad_norm": 0.0, "kl": 0.2772890627384186, "learning_rate": 4.049710968052851e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5255 }, { "completion_length": 677.25, "epoch": 1.4567627494456763, "grad_norm": 0.6551278233528137, "kl": 11.303743362426758, "learning_rate": 4.049367412207028e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5256 }, { "completion_length": 656.25, "epoch": 1.457039911308204, "grad_norm": 0.0, "kl": 0.17100889980793, "learning_rate": 4.049023808848552e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5257 }, { "completion_length": 682.25, "epoch": 1.4573170731707317, "grad_norm": 0.0, "kl": 0.19025918841362, "learning_rate": 4.048680157987961e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5258 }, { "completion_length": 627.0, "epoch": 1.4575942350332594, "grad_norm": 0.0, "kl": 0.18838781118392944, "learning_rate": 4.048336459635791e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5259 }, { "completion_length": 694.25, "epoch": 1.4578713968957873, "grad_norm": 0.0, "kl": 6.161405563354492, "learning_rate": 4.047992713802582e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5260 }, { "completion_length": 612.0, "epoch": 1.4581485587583147, "grad_norm": 0.0, "kl": 10493.6943359375, "learning_rate": 4.047648920498877e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5261 }, { "completion_length": 666.0, "epoch": 1.4584257206208426, "grad_norm": 0.0, "kl": 0.2673952579498291, "learning_rate": 4.047305079735217e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5262 }, { "completion_length": 803.0, "epoch": 1.4587028824833703, "grad_norm": 0.0, "kl": 0.16756626963615417, "learning_rate": 4.046961191522147e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5263 }, { "completion_length": 597.5, "epoch": 1.458980044345898, "grad_norm": 0.0, "kl": 0.3398585915565491, "learning_rate": 4.046617255870212e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5264 }, { "completion_length": 550.25, "epoch": 1.4592572062084257, "grad_norm": 0.0, "kl": 0.3861757218837738, "learning_rate": 4.04627327278996e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5265 }, { "completion_length": 518.75, "epoch": 1.4595343680709534, "grad_norm": 0.0, "kl": 0.2612280249595642, "learning_rate": 4.045929242291939e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5266 }, { "completion_length": 654.5, "epoch": 1.4598115299334813, "grad_norm": 0.0, "kl": 1.2684727907180786, "learning_rate": 4.045585164386699e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5267 }, { "completion_length": 637.75, "epoch": 1.4600886917960088, "grad_norm": 0.0, "kl": 0.1402052640914917, "learning_rate": 4.0452410390847915e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5268 }, { "completion_length": 577.75, "epoch": 1.4603658536585367, "grad_norm": 0.41485708951950073, "kl": 183.5689697265625, "learning_rate": 4.044896866396769e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5269 }, { "completion_length": 522.0, "epoch": 1.4606430155210643, "grad_norm": 0.0, "kl": 0.2980189919471741, "learning_rate": 4.044552646333187e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5270 }, { "completion_length": 590.0, "epoch": 1.460920177383592, "grad_norm": 0.0, "kl": 0.18936507403850555, "learning_rate": 4.0442083789046e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5271 }, { "completion_length": 586.5, "epoch": 1.4611973392461197, "grad_norm": 0.6313812732696533, "kl": 1778.5572509765625, "learning_rate": 4.043864064121566e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5272 }, { "completion_length": 556.5, "epoch": 1.4614745011086474, "grad_norm": 0.0, "kl": 0.15563662350177765, "learning_rate": 4.0435197019946425e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5273 }, { "completion_length": 505.75, "epoch": 1.461751662971175, "grad_norm": 0.0, "kl": 0.24510763585567474, "learning_rate": 4.043175292534392e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5274 }, { "completion_length": 672.75, "epoch": 1.4620288248337028, "grad_norm": 0.0, "kl": 0.17211924493312836, "learning_rate": 4.042830835751375e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5275 }, { "completion_length": 588.25, "epoch": 1.4623059866962307, "grad_norm": 0.7613090872764587, "kl": 9.952601432800293, "learning_rate": 4.042486331656153e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5276 }, { "completion_length": 608.0, "epoch": 1.4625831485587584, "grad_norm": 0.0, "kl": 0.1831621676683426, "learning_rate": 4.042141780259292e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5277 }, { "completion_length": 601.5, "epoch": 1.462860310421286, "grad_norm": 0.0, "kl": 0.16878841817378998, "learning_rate": 4.041797181571358e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5278 }, { "completion_length": 581.25, "epoch": 1.4631374722838137, "grad_norm": 0.0, "kl": 0.19443242251873016, "learning_rate": 4.041452535602918e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5279 }, { "completion_length": 578.5, "epoch": 1.4634146341463414, "grad_norm": 0.0, "kl": 0.17410513758659363, "learning_rate": 4.041107842364541e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5280 }, { "completion_length": 508.25, "epoch": 1.4636917960088691, "grad_norm": 0.0, "kl": 0.5755799412727356, "learning_rate": 4.040763101866798e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5281 }, { "completion_length": 569.25, "epoch": 1.4639689578713968, "grad_norm": 0.0, "kl": 0.1889384388923645, "learning_rate": 4.04041831412026e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5282 }, { "completion_length": 561.25, "epoch": 1.4642461197339247, "grad_norm": 0.0, "kl": 0.1724434494972229, "learning_rate": 4.0400734791354996e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5283 }, { "completion_length": 536.0, "epoch": 1.4645232815964524, "grad_norm": 0.0, "kl": 0.23179081082344055, "learning_rate": 4.039728596923093e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5284 }, { "completion_length": 559.25, "epoch": 1.46480044345898, "grad_norm": 0.0, "kl": 0.3678642511367798, "learning_rate": 4.039383667493614e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5285 }, { "completion_length": 551.75, "epoch": 1.4650776053215078, "grad_norm": 0.0, "kl": 0.21394595503807068, "learning_rate": 4.039038690857643e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5286 }, { "completion_length": 715.0, "epoch": 1.4653547671840355, "grad_norm": 0.0, "kl": 0.14923742413520813, "learning_rate": 4.038693667025757e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5287 }, { "completion_length": 608.75, "epoch": 1.4656319290465631, "grad_norm": 0.0, "kl": 0.18449315428733826, "learning_rate": 4.038348596008537e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5288 }, { "completion_length": 523.25, "epoch": 1.4659090909090908, "grad_norm": 0.0, "kl": 0.19559457898139954, "learning_rate": 4.038003477816564e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5289 }, { "completion_length": 542.5, "epoch": 1.4661862527716187, "grad_norm": 0.0, "kl": 0.6606143712997437, "learning_rate": 4.037658312460424e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5290 }, { "completion_length": 625.5, "epoch": 1.4664634146341464, "grad_norm": 0.0, "kl": 0.15128150582313538, "learning_rate": 4.037313099950699e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5291 }, { "completion_length": 555.25, "epoch": 1.466740576496674, "grad_norm": 0.0, "kl": 0.19924505054950714, "learning_rate": 4.036967840297978e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5292 }, { "completion_length": 581.5, "epoch": 1.4670177383592018, "grad_norm": 0.0, "kl": 8697.703125, "learning_rate": 4.036622533512845e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5293 }, { "completion_length": 629.75, "epoch": 1.4672949002217295, "grad_norm": 0.0, "kl": 0.30492186546325684, "learning_rate": 4.036277179605891e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5294 }, { "completion_length": 535.75, "epoch": 1.4675720620842572, "grad_norm": 0.0, "kl": 0.24097108840942383, "learning_rate": 4.0359317785877075e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5295 }, { "completion_length": 552.0, "epoch": 1.4678492239467849, "grad_norm": 0.0, "kl": 0.17264388501644135, "learning_rate": 4.035586330468886e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5296 }, { "completion_length": 583.25, "epoch": 1.4681263858093128, "grad_norm": 0.6603940725326538, "kl": 101771.1171875, "learning_rate": 4.0352408352600196e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5297 }, { "completion_length": 630.75, "epoch": 1.4684035476718402, "grad_norm": 0.0, "kl": 0.2380499690771103, "learning_rate": 4.034895292971702e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5298 }, { "completion_length": 596.5, "epoch": 1.4686807095343681, "grad_norm": 0.43365079164505005, "kl": 0.16339465975761414, "learning_rate": 4.034549703614532e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5299 }, { "completion_length": 539.5, "epoch": 1.4689578713968958, "grad_norm": 0.0, "kl": 0.1841920018196106, "learning_rate": 4.034204067199106e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5300 }, { "completion_length": 544.25, "epoch": 1.4692350332594235, "grad_norm": 0.0, "kl": 0.2142963409423828, "learning_rate": 4.0338583837360225e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5301 }, { "completion_length": 549.75, "epoch": 1.4695121951219512, "grad_norm": 0.0, "kl": 2.113142490386963, "learning_rate": 4.033512653235884e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5302 }, { "completion_length": 595.0, "epoch": 1.4697893569844789, "grad_norm": 0.0, "kl": 0.21021904051303864, "learning_rate": 4.033166875709291e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5303 }, { "completion_length": 561.5, "epoch": 1.4700665188470068, "grad_norm": 0.0, "kl": 0.1841791272163391, "learning_rate": 4.032821051166848e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5304 }, { "completion_length": 632.75, "epoch": 1.4703436807095343, "grad_norm": 0.0, "kl": 0.1592087596654892, "learning_rate": 4.03247517961916e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5305 }, { "completion_length": 652.75, "epoch": 1.4706208425720622, "grad_norm": 0.0, "kl": 0.17296400666236877, "learning_rate": 4.032129261076833e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5306 }, { "completion_length": 752.0, "epoch": 1.4708980044345898, "grad_norm": 0.0, "kl": 0.14627060294151306, "learning_rate": 4.031783295550475e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5307 }, { "completion_length": 537.75, "epoch": 1.4711751662971175, "grad_norm": 0.0, "kl": 0.6509762406349182, "learning_rate": 4.031437283050696e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5308 }, { "completion_length": 558.5, "epoch": 1.4714523281596452, "grad_norm": 0.0, "kl": 0.20643571019172668, "learning_rate": 4.031091223588105e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5309 }, { "completion_length": 604.5, "epoch": 1.471729490022173, "grad_norm": 0.3888067901134491, "kl": 22.1326961517334, "learning_rate": 4.030745117173316e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5310 }, { "completion_length": 676.5, "epoch": 1.4720066518847006, "grad_norm": 0.0, "kl": 0.17835456132888794, "learning_rate": 4.030398963816941e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5311 }, { "completion_length": 710.25, "epoch": 1.4722838137472283, "grad_norm": 0.0, "kl": 0.1758846938610077, "learning_rate": 4.0300527635295985e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5312 }, { "completion_length": 545.25, "epoch": 1.4725609756097562, "grad_norm": 0.0, "kl": 0.17373110353946686, "learning_rate": 4.0297065163219005e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5313 }, { "completion_length": 613.0, "epoch": 1.4728381374722839, "grad_norm": 0.0, "kl": 0.16751091182231903, "learning_rate": 4.0293602222044685e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5314 }, { "completion_length": 591.25, "epoch": 1.4731152993348116, "grad_norm": 0.0, "kl": 0.1528708040714264, "learning_rate": 4.029013881187919e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5315 }, { "completion_length": 658.0, "epoch": 1.4733924611973392, "grad_norm": 0.0, "kl": 0.17536360025405884, "learning_rate": 4.028667493282875e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5316 }, { "completion_length": 547.75, "epoch": 1.473669623059867, "grad_norm": 0.0, "kl": 0.15267963707447052, "learning_rate": 4.028321058499958e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5317 }, { "completion_length": 596.5, "epoch": 1.4739467849223946, "grad_norm": 0.0, "kl": 0.21602332592010498, "learning_rate": 4.027974576849792e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5318 }, { "completion_length": 525.25, "epoch": 1.4742239467849223, "grad_norm": 0.0, "kl": 0.4054949879646301, "learning_rate": 4.027628048343002e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5319 }, { "completion_length": 662.0, "epoch": 1.4745011086474502, "grad_norm": 0.0, "kl": 0.18273970484733582, "learning_rate": 4.027281472990215e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5320 }, { "completion_length": 636.75, "epoch": 1.474778270509978, "grad_norm": 0.0, "kl": 0.18307039141654968, "learning_rate": 4.026934850802057e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5321 }, { "completion_length": 580.0, "epoch": 1.4750554323725056, "grad_norm": 0.0, "kl": 0.6980326771736145, "learning_rate": 4.02658818178916e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5322 }, { "completion_length": 678.5, "epoch": 1.4753325942350333, "grad_norm": 0.0, "kl": 0.15324018895626068, "learning_rate": 4.026241465962154e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5323 }, { "completion_length": 694.5, "epoch": 1.475609756097561, "grad_norm": 0.0, "kl": 0.23535655438899994, "learning_rate": 4.025894703331671e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5324 }, { "completion_length": 642.25, "epoch": 1.4758869179600886, "grad_norm": 0.0, "kl": 0.17375171184539795, "learning_rate": 4.025547893908344e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5325 }, { "completion_length": 567.0, "epoch": 1.4761640798226163, "grad_norm": 0.0, "kl": 0.2174558788537979, "learning_rate": 4.025201037702811e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5326 }, { "completion_length": 522.25, "epoch": 1.4764412416851442, "grad_norm": 0.0, "kl": 0.1686902791261673, "learning_rate": 4.0248541347257065e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5327 }, { "completion_length": 623.75, "epoch": 1.476718403547672, "grad_norm": 0.0, "kl": 0.21989530324935913, "learning_rate": 4.024507184987668e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5328 }, { "completion_length": 569.0, "epoch": 1.4769955654101996, "grad_norm": 0.0, "kl": 0.177145317196846, "learning_rate": 4.024160188499337e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5329 }, { "completion_length": 588.75, "epoch": 1.4772727272727273, "grad_norm": 0.0, "kl": 0.1991463005542755, "learning_rate": 4.023813145271352e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5330 }, { "completion_length": 522.25, "epoch": 1.477549889135255, "grad_norm": 0.0, "kl": 0.4583966135978699, "learning_rate": 4.0234660553143575e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5331 }, { "completion_length": 551.5, "epoch": 1.4778270509977827, "grad_norm": 0.0, "kl": 42.86201858520508, "learning_rate": 4.023118918638997e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5332 }, { "completion_length": 502.25, "epoch": 1.4781042128603104, "grad_norm": 0.0, "kl": 0.25148874521255493, "learning_rate": 4.022771735255915e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5333 }, { "completion_length": 639.5, "epoch": 1.4783813747228383, "grad_norm": 0.0, "kl": 1.1411727666854858, "learning_rate": 4.0224245051757586e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5334 }, { "completion_length": 583.25, "epoch": 1.4786585365853657, "grad_norm": 0.0, "kl": 0.22118213772773743, "learning_rate": 4.022077228409176e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5335 }, { "completion_length": 587.25, "epoch": 1.4789356984478936, "grad_norm": 0.0, "kl": 0.16553641855716705, "learning_rate": 4.021729904966815e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5336 }, { "completion_length": 614.5, "epoch": 1.4792128603104213, "grad_norm": 0.0, "kl": 0.19145964086055756, "learning_rate": 4.021382534859329e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5337 }, { "completion_length": 560.75, "epoch": 1.479490022172949, "grad_norm": 0.0, "kl": 0.20115996897220612, "learning_rate": 4.02103511809737e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5338 }, { "completion_length": 615.0, "epoch": 1.4797671840354767, "grad_norm": 0.0, "kl": 0.14651280641555786, "learning_rate": 4.02068765469159e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5339 }, { "completion_length": 580.5, "epoch": 1.4800443458980044, "grad_norm": 0.0, "kl": 0.20452822744846344, "learning_rate": 4.020340144652647e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5340 }, { "completion_length": 610.75, "epoch": 1.4803215077605323, "grad_norm": 0.0, "kl": 0.1776917427778244, "learning_rate": 4.019992587991196e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5341 }, { "completion_length": 658.75, "epoch": 1.4805986696230597, "grad_norm": 0.0, "kl": 0.17642204463481903, "learning_rate": 4.0196449847178945e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5342 }, { "completion_length": 640.5, "epoch": 1.4808758314855877, "grad_norm": 0.0, "kl": 0.28057754039764404, "learning_rate": 4.019297334843404e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5343 }, { "completion_length": 658.0, "epoch": 1.4811529933481153, "grad_norm": 0.0, "kl": 0.260613352060318, "learning_rate": 4.018949638378384e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5344 }, { "completion_length": 566.5, "epoch": 1.481430155210643, "grad_norm": 0.0, "kl": 57300.64453125, "learning_rate": 4.018601895333496e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5345 }, { "completion_length": 567.0, "epoch": 1.4817073170731707, "grad_norm": 0.0, "kl": 0.165848970413208, "learning_rate": 4.018254105719405e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5346 }, { "completion_length": 618.75, "epoch": 1.4819844789356984, "grad_norm": 0.0, "kl": 0.2373272329568863, "learning_rate": 4.017906269546778e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5347 }, { "completion_length": 643.75, "epoch": 1.482261640798226, "grad_norm": 0.0, "kl": 0.4147825837135315, "learning_rate": 4.0175583868262805e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5348 }, { "completion_length": 565.75, "epoch": 1.4825388026607538, "grad_norm": 0.0, "kl": 0.2487793266773224, "learning_rate": 4.017210457568578e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5349 }, { "completion_length": 585.5, "epoch": 1.4828159645232817, "grad_norm": 0.0, "kl": 0.18191294372081757, "learning_rate": 4.016862481784343e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5350 }, { "completion_length": 575.75, "epoch": 1.4830931263858094, "grad_norm": 0.0, "kl": 0.19657786190509796, "learning_rate": 4.016514459484247e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5351 }, { "completion_length": 688.75, "epoch": 1.483370288248337, "grad_norm": 0.0, "kl": 0.17796701192855835, "learning_rate": 4.01616639067896e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5352 }, { "completion_length": 626.75, "epoch": 1.4836474501108647, "grad_norm": 0.0, "kl": 0.21105287969112396, "learning_rate": 4.0158182753791566e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5353 }, { "completion_length": 619.5, "epoch": 1.4839246119733924, "grad_norm": 0.46325454115867615, "kl": 0.16768604516983032, "learning_rate": 4.015470113595513e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5354 }, { "completion_length": 636.75, "epoch": 1.4842017738359201, "grad_norm": 0.0, "kl": 0.19658046960830688, "learning_rate": 4.015121905338704e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5355 }, { "completion_length": 571.5, "epoch": 1.4844789356984478, "grad_norm": 0.0, "kl": 0.188772514462471, "learning_rate": 4.01477365061941e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5356 }, { "completion_length": 704.25, "epoch": 1.4847560975609757, "grad_norm": 0.0, "kl": 0.19212627410888672, "learning_rate": 4.0144253494483095e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5357 }, { "completion_length": 580.5, "epoch": 1.4850332594235034, "grad_norm": 0.0, "kl": 0.184607595205307, "learning_rate": 4.014077001836083e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5358 }, { "completion_length": 629.0, "epoch": 1.485310421286031, "grad_norm": 0.0, "kl": 0.1618749499320984, "learning_rate": 4.0137286077934135e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5359 }, { "completion_length": 606.25, "epoch": 1.4855875831485588, "grad_norm": 0.0, "kl": 0.22049298882484436, "learning_rate": 4.013380167330985e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5360 }, { "completion_length": 681.5, "epoch": 1.4858647450110865, "grad_norm": 0.0, "kl": 0.17004035413265228, "learning_rate": 4.013031680459481e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5361 }, { "completion_length": 592.25, "epoch": 1.4861419068736141, "grad_norm": 0.0, "kl": 0.20057030022144318, "learning_rate": 4.01268314718959e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5362 }, { "completion_length": 560.25, "epoch": 1.4864190687361418, "grad_norm": 0.0, "kl": 0.17240409553050995, "learning_rate": 4.012334567532e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5363 }, { "completion_length": 520.0, "epoch": 1.4866962305986697, "grad_norm": 0.0, "kl": 0.18508540093898773, "learning_rate": 4.011985941497399e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5364 }, { "completion_length": 648.25, "epoch": 1.4869733924611974, "grad_norm": 0.41072529554367065, "kl": 688.2888793945312, "learning_rate": 4.01163726909648e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5365 }, { "completion_length": 612.25, "epoch": 1.487250554323725, "grad_norm": 0.5035135746002197, "kl": 0.16288301348686218, "learning_rate": 4.011288550339933e-06, "loss": -0.0, "reward": 5.71875, "reward_std": 0.0625, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.71875, "step": 5366 }, { "completion_length": 573.75, "epoch": 1.4875277161862528, "grad_norm": 0.0, "kl": 0.22674091160297394, "learning_rate": 4.010939785238454e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5367 }, { "completion_length": 632.0, "epoch": 1.4878048780487805, "grad_norm": 0.0, "kl": 0.18788504600524902, "learning_rate": 4.010590973802737e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5368 }, { "completion_length": 664.75, "epoch": 1.4880820399113082, "grad_norm": 0.0, "kl": 0.16478991508483887, "learning_rate": 4.010242116043478e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5369 }, { "completion_length": 562.75, "epoch": 1.4883592017738358, "grad_norm": 0.0, "kl": 0.18860621750354767, "learning_rate": 4.009893211971376e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5370 }, { "completion_length": 601.25, "epoch": 1.4886363636363638, "grad_norm": 0.0, "kl": 0.18622691929340363, "learning_rate": 4.009544261597131e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5371 }, { "completion_length": 628.5, "epoch": 1.4889135254988912, "grad_norm": 0.0, "kl": 0.2131623923778534, "learning_rate": 4.009195264931443e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5372 }, { "completion_length": 519.25, "epoch": 1.4891906873614191, "grad_norm": 0.0, "kl": 0.1700993925333023, "learning_rate": 4.008846221985013e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5373 }, { "completion_length": 562.0, "epoch": 1.4894678492239468, "grad_norm": 0.0, "kl": 0.18044984340667725, "learning_rate": 4.008497132768548e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5374 }, { "completion_length": 630.0, "epoch": 1.4897450110864745, "grad_norm": 0.0, "kl": 0.317425012588501, "learning_rate": 4.008147997292749e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5375 }, { "completion_length": 549.0, "epoch": 1.4900221729490022, "grad_norm": 0.0, "kl": 0.1812727153301239, "learning_rate": 4.007798815568326e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5376 }, { "completion_length": 507.75, "epoch": 1.4902993348115299, "grad_norm": 0.0, "kl": 0.18152165412902832, "learning_rate": 4.007449587605985e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5377 }, { "completion_length": 651.5, "epoch": 1.4905764966740578, "grad_norm": 0.0, "kl": 0.15620996057987213, "learning_rate": 4.007100313416435e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5378 }, { "completion_length": 605.0, "epoch": 1.4908536585365852, "grad_norm": 0.0, "kl": 0.16753451526165009, "learning_rate": 4.00675099301039e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5379 }, { "completion_length": 562.5, "epoch": 1.4911308203991132, "grad_norm": 0.4372571110725403, "kl": 659.1333618164062, "learning_rate": 4.006401626398559e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5380 }, { "completion_length": 642.25, "epoch": 1.4914079822616408, "grad_norm": 0.0, "kl": 0.19249075651168823, "learning_rate": 4.006052213591657e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5381 }, { "completion_length": 659.25, "epoch": 1.4916851441241685, "grad_norm": 0.0, "kl": 0.21985343098640442, "learning_rate": 4.005702754600398e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5382 }, { "completion_length": 638.0, "epoch": 1.4919623059866962, "grad_norm": 0.0, "kl": 1.691198468208313, "learning_rate": 4.0053532494354985e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5383 }, { "completion_length": 673.5, "epoch": 1.492239467849224, "grad_norm": 0.0, "kl": 0.24279500544071198, "learning_rate": 4.005003698107678e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5384 }, { "completion_length": 629.5, "epoch": 1.4925166297117516, "grad_norm": 0.0, "kl": 0.4599900543689728, "learning_rate": 4.004654100627655e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5385 }, { "completion_length": 667.25, "epoch": 1.4927937915742793, "grad_norm": 0.5330818891525269, "kl": 22.27869987487793, "learning_rate": 4.004304457006148e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5386 }, { "completion_length": 573.75, "epoch": 1.4930709534368072, "grad_norm": 0.0, "kl": 0.5657882690429688, "learning_rate": 4.003954767253883e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5387 }, { "completion_length": 632.0, "epoch": 1.4933481152993349, "grad_norm": 0.0, "kl": 0.22589468955993652, "learning_rate": 4.003605031381581e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5388 }, { "completion_length": 622.0, "epoch": 1.4936252771618626, "grad_norm": 0.0, "kl": 0.15612035989761353, "learning_rate": 4.003255249399967e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5389 }, { "completion_length": 679.75, "epoch": 1.4939024390243902, "grad_norm": 0.0, "kl": 0.14914259314537048, "learning_rate": 4.00290542131977e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5390 }, { "completion_length": 650.0, "epoch": 1.494179600886918, "grad_norm": 0.0, "kl": 0.16801787912845612, "learning_rate": 4.002555547151713e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5391 }, { "completion_length": 617.0, "epoch": 1.4944567627494456, "grad_norm": 0.0, "kl": 0.14959043264389038, "learning_rate": 4.002205626906529e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5392 }, { "completion_length": 617.0, "epoch": 1.4947339246119733, "grad_norm": 0.0, "kl": 0.1648213267326355, "learning_rate": 4.001855660594948e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5393 }, { "completion_length": 599.5, "epoch": 1.4950110864745012, "grad_norm": 0.0, "kl": 0.22341115772724152, "learning_rate": 4.0015056482277e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5394 }, { "completion_length": 546.75, "epoch": 1.495288248337029, "grad_norm": 0.0, "kl": 0.22447988390922546, "learning_rate": 4.001155589815521e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5395 }, { "completion_length": 478.0, "epoch": 1.4955654101995566, "grad_norm": 0.0, "kl": 0.20576418936252594, "learning_rate": 4.000805485369145e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5396 }, { "completion_length": 665.75, "epoch": 1.4958425720620843, "grad_norm": 0.0, "kl": 0.2759302258491516, "learning_rate": 4.000455334899307e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5397 }, { "completion_length": 552.0, "epoch": 1.496119733924612, "grad_norm": 0.0, "kl": 0.1762852668762207, "learning_rate": 4.000105138416746e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5398 }, { "completion_length": 638.0, "epoch": 1.4963968957871396, "grad_norm": 0.46031802892684937, "kl": 0.8912643194198608, "learning_rate": 3.999754895932201e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5399 }, { "completion_length": 528.0, "epoch": 1.4966740576496673, "grad_norm": 0.0, "kl": 0.7877105474472046, "learning_rate": 3.9994046074564115e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5400 }, { "completion_length": 565.5, "epoch": 1.4969512195121952, "grad_norm": 0.0, "kl": 0.17440678179264069, "learning_rate": 3.99905427300012e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5401 }, { "completion_length": 653.75, "epoch": 1.497228381374723, "grad_norm": 0.0, "kl": 0.16979436576366425, "learning_rate": 3.99870389257407e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5402 }, { "completion_length": 602.0, "epoch": 1.4975055432372506, "grad_norm": 0.0, "kl": 0.16698873043060303, "learning_rate": 3.998353466189007e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5403 }, { "completion_length": 650.5, "epoch": 1.4977827050997783, "grad_norm": 0.0, "kl": 0.6319389939308167, "learning_rate": 3.998002993855676e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5404 }, { "completion_length": 606.5, "epoch": 1.498059866962306, "grad_norm": 0.0, "kl": 0.20439369976520538, "learning_rate": 3.997652475584824e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5405 }, { "completion_length": 572.25, "epoch": 1.4983370288248337, "grad_norm": 0.0, "kl": 0.1992768496274948, "learning_rate": 3.9973019113872004e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5406 }, { "completion_length": 668.75, "epoch": 1.4986141906873613, "grad_norm": 0.4240284562110901, "kl": 0.5366332530975342, "learning_rate": 3.996951301273556e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5407 }, { "completion_length": 559.5, "epoch": 1.4988913525498893, "grad_norm": 0.0, "kl": 0.19472196698188782, "learning_rate": 3.996600645254643e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5408 }, { "completion_length": 581.5, "epoch": 1.4991685144124167, "grad_norm": 0.0, "kl": 0.15629614889621735, "learning_rate": 3.996249943341214e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5409 }, { "completion_length": 592.0, "epoch": 1.4994456762749446, "grad_norm": 0.0, "kl": 0.18477018177509308, "learning_rate": 3.995899195544023e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5410 }, { "completion_length": 556.0, "epoch": 1.4997228381374723, "grad_norm": 0.0, "kl": 0.14509853720664978, "learning_rate": 3.995548401873827e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5411 }, { "completion_length": 540.5, "epoch": 1.5, "grad_norm": 0.0, "kl": 0.2087395340204239, "learning_rate": 3.995197562341382e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5412 }, { "completion_length": 577.75, "epoch": 1.5002771618625277, "grad_norm": 0.0, "kl": 0.18658217787742615, "learning_rate": 3.994846676957448e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5413 }, { "completion_length": 624.25, "epoch": 1.5005543237250554, "grad_norm": 0.7724356651306152, "kl": 0.25080177187919617, "learning_rate": 3.994495745732785e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5414 }, { "completion_length": 532.0, "epoch": 1.5008314855875833, "grad_norm": 0.0, "kl": 0.196329727768898, "learning_rate": 3.994144768678154e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5415 }, { "completion_length": 628.25, "epoch": 1.5011086474501107, "grad_norm": 0.0, "kl": 0.20293483138084412, "learning_rate": 3.993793745804318e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5416 }, { "completion_length": 609.25, "epoch": 1.5013858093126387, "grad_norm": 0.4353806972503662, "kl": 8.719454765319824, "learning_rate": 3.993442677122043e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5417 }, { "completion_length": 605.25, "epoch": 1.5016629711751663, "grad_norm": 0.0, "kl": 0.17055699229240417, "learning_rate": 3.993091562642093e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5418 }, { "completion_length": 652.25, "epoch": 1.501940133037694, "grad_norm": 0.0, "kl": 0.1710515320301056, "learning_rate": 3.992740402375236e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5419 }, { "completion_length": 627.75, "epoch": 1.5022172949002217, "grad_norm": 0.0, "kl": 0.15463055670261383, "learning_rate": 3.992389196332241e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5420 }, { "completion_length": 643.5, "epoch": 1.5024944567627494, "grad_norm": 0.0, "kl": 0.17175669968128204, "learning_rate": 3.992037944523877e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5421 }, { "completion_length": 623.0, "epoch": 1.5027716186252773, "grad_norm": 0.0, "kl": 0.18324559926986694, "learning_rate": 3.991686646960916e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5422 }, { "completion_length": 651.75, "epoch": 1.5030487804878048, "grad_norm": 0.0, "kl": 0.1531716287136078, "learning_rate": 3.991335303654131e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5423 }, { "completion_length": 676.25, "epoch": 1.5033259423503327, "grad_norm": 0.0, "kl": 0.15859022736549377, "learning_rate": 3.990983914614296e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5424 }, { "completion_length": 581.0, "epoch": 1.5036031042128604, "grad_norm": 0.0, "kl": 0.1785711646080017, "learning_rate": 3.9906324798521864e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5425 }, { "completion_length": 592.25, "epoch": 1.503880266075388, "grad_norm": 0.0, "kl": 0.17168863117694855, "learning_rate": 3.99028099937858e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5426 }, { "completion_length": 585.5, "epoch": 1.5041574279379157, "grad_norm": 0.0, "kl": 0.20809602737426758, "learning_rate": 3.9899294732042546e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5427 }, { "completion_length": 601.5, "epoch": 1.5044345898004434, "grad_norm": 0.45425310730934143, "kl": 0.21587419509887695, "learning_rate": 3.989577901339989e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5428 }, { "completion_length": 578.75, "epoch": 1.5047117516629713, "grad_norm": 0.43921056389808655, "kl": 6.907687664031982, "learning_rate": 3.989226283796567e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5429 }, { "completion_length": 627.25, "epoch": 1.5049889135254988, "grad_norm": 0.0, "kl": 0.18120990693569183, "learning_rate": 3.98887462058477e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5430 }, { "completion_length": 550.25, "epoch": 1.5052660753880267, "grad_norm": 0.4805491864681244, "kl": 0.21430106461048126, "learning_rate": 3.988522911715381e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5431 }, { "completion_length": 595.25, "epoch": 1.5055432372505542, "grad_norm": 0.0, "kl": 0.2623163163661957, "learning_rate": 3.988171157199188e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5432 }, { "completion_length": 608.5, "epoch": 1.505820399113082, "grad_norm": 0.0, "kl": 0.19676639139652252, "learning_rate": 3.987819357046975e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5433 }, { "completion_length": 660.5, "epoch": 1.5060975609756098, "grad_norm": 0.0, "kl": 0.15488463640213013, "learning_rate": 3.987467511269532e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5434 }, { "completion_length": 536.25, "epoch": 1.5063747228381374, "grad_norm": 0.0, "kl": 0.17228607833385468, "learning_rate": 3.987115619877648e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5435 }, { "completion_length": 597.25, "epoch": 1.5066518847006651, "grad_norm": 0.0, "kl": 0.22726650536060333, "learning_rate": 3.9867636828821135e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5436 }, { "completion_length": 657.0, "epoch": 1.5069290465631928, "grad_norm": 0.0, "kl": 0.18237143754959106, "learning_rate": 3.9864117002937215e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5437 }, { "completion_length": 824.75, "epoch": 1.5072062084257207, "grad_norm": 0.0, "kl": 0.17572945356369019, "learning_rate": 3.986059672123267e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5438 }, { "completion_length": 507.5, "epoch": 1.5074833702882482, "grad_norm": 0.0, "kl": 0.26642298698425293, "learning_rate": 3.985707598381544e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5439 }, { "completion_length": 618.5, "epoch": 1.507760532150776, "grad_norm": 0.0, "kl": 0.1721702516078949, "learning_rate": 3.985355479079349e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5440 }, { "completion_length": 620.75, "epoch": 1.5080376940133038, "grad_norm": 0.0, "kl": 0.22101785242557526, "learning_rate": 3.985003314227481e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5441 }, { "completion_length": 606.25, "epoch": 1.5083148558758315, "grad_norm": 0.0, "kl": 0.3611709773540497, "learning_rate": 3.9846511038367384e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5442 }, { "completion_length": 646.5, "epoch": 1.5085920177383592, "grad_norm": 0.0, "kl": 0.2189493477344513, "learning_rate": 3.984298847917923e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5443 }, { "completion_length": 585.75, "epoch": 1.5088691796008868, "grad_norm": 0.0, "kl": 0.17927637696266174, "learning_rate": 3.983946546481835e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5444 }, { "completion_length": 620.75, "epoch": 1.5091463414634148, "grad_norm": 0.0, "kl": 0.19596709311008453, "learning_rate": 3.9835941995392806e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5445 }, { "completion_length": 590.25, "epoch": 1.5094235033259422, "grad_norm": 0.0, "kl": 0.41113027930259705, "learning_rate": 3.983241807101064e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5446 }, { "completion_length": 686.5, "epoch": 1.5097006651884701, "grad_norm": 0.0, "kl": 0.15536227822303772, "learning_rate": 3.98288936917799e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5447 }, { "completion_length": 588.5, "epoch": 1.5099778270509978, "grad_norm": 0.0, "kl": 26.80759620666504, "learning_rate": 3.982536885780869e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5448 }, { "completion_length": 540.5, "epoch": 1.5102549889135255, "grad_norm": 0.0, "kl": 0.25138601660728455, "learning_rate": 3.982184356920508e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5449 }, { "completion_length": 625.25, "epoch": 1.5105321507760532, "grad_norm": 0.0, "kl": 0.30323851108551025, "learning_rate": 3.981831782607719e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5450 }, { "completion_length": 591.75, "epoch": 1.5108093126385809, "grad_norm": 0.0, "kl": 0.2144877016544342, "learning_rate": 3.981479162853313e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5451 }, { "completion_length": 658.25, "epoch": 1.5110864745011088, "grad_norm": 0.0, "kl": 0.2278285175561905, "learning_rate": 3.981126497668105e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5452 }, { "completion_length": 588.75, "epoch": 1.5113636363636362, "grad_norm": 0.3861635625362396, "kl": 0.19160069525241852, "learning_rate": 3.980773787062907e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5453 }, { "completion_length": 620.75, "epoch": 1.5116407982261642, "grad_norm": 0.0, "kl": 1.3941066265106201, "learning_rate": 3.980421031048538e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5454 }, { "completion_length": 600.25, "epoch": 1.5119179600886918, "grad_norm": 0.0, "kl": 0.16320401430130005, "learning_rate": 3.980068229635814e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5455 }, { "completion_length": 671.5, "epoch": 1.5121951219512195, "grad_norm": 0.0, "kl": 0.15168684720993042, "learning_rate": 3.979715382835555e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5456 }, { "completion_length": 548.0, "epoch": 1.5124722838137472, "grad_norm": 0.6986731290817261, "kl": 2.7289083003997803, "learning_rate": 3.9793624906585805e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5457 }, { "completion_length": 580.0, "epoch": 1.512749445676275, "grad_norm": 0.0, "kl": 0.3111129701137543, "learning_rate": 3.979009553115711e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5458 }, { "completion_length": 613.75, "epoch": 1.5130266075388028, "grad_norm": 0.0, "kl": 0.18861816823482513, "learning_rate": 3.9786565702177725e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5459 }, { "completion_length": 644.5, "epoch": 1.5133037694013303, "grad_norm": 0.4658026099205017, "kl": 0.29563549160957336, "learning_rate": 3.978303541975588e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5460 }, { "completion_length": 596.0, "epoch": 1.5135809312638582, "grad_norm": 0.0, "kl": 0.18594065308570862, "learning_rate": 3.9779504683999835e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5461 }, { "completion_length": 710.75, "epoch": 1.5138580931263859, "grad_norm": 0.0, "kl": 0.31853216886520386, "learning_rate": 3.977597349501785e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5462 }, { "completion_length": 776.75, "epoch": 1.5141352549889135, "grad_norm": 0.0, "kl": 0.1952134221792221, "learning_rate": 3.977244185291824e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5463 }, { "completion_length": 660.25, "epoch": 1.5144124168514412, "grad_norm": 0.537773847579956, "kl": 0.6368292570114136, "learning_rate": 3.976890975780928e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5464 }, { "completion_length": 649.25, "epoch": 1.514689578713969, "grad_norm": 0.0, "kl": 0.1987643837928772, "learning_rate": 3.9765377209799294e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5465 }, { "completion_length": 692.75, "epoch": 1.5149667405764968, "grad_norm": 0.0, "kl": 0.17363931238651276, "learning_rate": 3.976184420899662e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5466 }, { "completion_length": 610.5, "epoch": 1.5152439024390243, "grad_norm": 0.0, "kl": 0.23868650197982788, "learning_rate": 3.975831075550959e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5467 }, { "completion_length": 713.25, "epoch": 1.5155210643015522, "grad_norm": 0.0, "kl": 25.2901611328125, "learning_rate": 3.9754776849446554e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5468 }, { "completion_length": 669.5, "epoch": 1.5157982261640797, "grad_norm": 0.37102916836738586, "kl": 0.19911238551139832, "learning_rate": 3.97512424909159e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5469 }, { "completion_length": 622.75, "epoch": 1.5160753880266076, "grad_norm": 0.0, "kl": 1.6121232509613037, "learning_rate": 3.974770768002599e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5470 }, { "completion_length": 597.25, "epoch": 1.5163525498891353, "grad_norm": 0.0, "kl": 0.3645458221435547, "learning_rate": 3.974417241688525e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5471 }, { "completion_length": 624.25, "epoch": 1.516629711751663, "grad_norm": 0.0, "kl": 0.18742592632770538, "learning_rate": 3.9740636701602065e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5472 }, { "completion_length": 652.5, "epoch": 1.5169068736141909, "grad_norm": 0.0, "kl": 0.1663975864648819, "learning_rate": 3.973710053428487e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5473 }, { "completion_length": 602.0, "epoch": 1.5171840354767183, "grad_norm": 0.0, "kl": 0.19888296723365784, "learning_rate": 3.9733563915042114e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5474 }, { "completion_length": 570.75, "epoch": 1.5174611973392462, "grad_norm": 0.0, "kl": 0.21520736813545227, "learning_rate": 3.973002684398225e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5475 }, { "completion_length": 599.25, "epoch": 1.5177383592017737, "grad_norm": 0.0, "kl": 0.459173321723938, "learning_rate": 3.972648932121372e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5476 }, { "completion_length": 962.5, "epoch": 1.5180155210643016, "grad_norm": 0.21307472884655, "kl": 0.15974298119544983, "learning_rate": 3.972295134684504e-06, "loss": 0.0, "reward": 4.09375, "reward_std": 3.3125, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.59375, "step": 5477 }, { "completion_length": 614.5, "epoch": 1.5182926829268293, "grad_norm": 0.0, "kl": 0.19287532567977905, "learning_rate": 3.971941292098468e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5478 }, { "completion_length": 663.75, "epoch": 1.518569844789357, "grad_norm": 0.0, "kl": 0.18049567937850952, "learning_rate": 3.971587404374116e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5479 }, { "completion_length": 751.0, "epoch": 1.5188470066518847, "grad_norm": 0.39372196793556213, "kl": 0.187201127409935, "learning_rate": 3.9712334715222985e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5480 }, { "completion_length": 681.5, "epoch": 1.5191241685144123, "grad_norm": 0.4467040002346039, "kl": 0.18084800243377686, "learning_rate": 3.970879493553873e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5481 }, { "completion_length": 589.0, "epoch": 1.5194013303769403, "grad_norm": 0.0, "kl": 0.18548651039600372, "learning_rate": 3.970525470479691e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5482 }, { "completion_length": 671.0, "epoch": 1.5196784922394677, "grad_norm": 0.0, "kl": 0.24099193513393402, "learning_rate": 3.9701714023106095e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5483 }, { "completion_length": 613.0, "epoch": 1.5199556541019956, "grad_norm": 0.0, "kl": 0.3303147852420807, "learning_rate": 3.969817289057487e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5484 }, { "completion_length": 598.5, "epoch": 1.5202328159645233, "grad_norm": 0.0, "kl": 0.2068192958831787, "learning_rate": 3.969463130731183e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5485 }, { "completion_length": 647.75, "epoch": 1.520509977827051, "grad_norm": 0.5372950434684753, "kl": 0.17915335297584534, "learning_rate": 3.969108927342558e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5486 }, { "completion_length": 646.5, "epoch": 1.5207871396895787, "grad_norm": 0.0, "kl": 0.17155821621418, "learning_rate": 3.968754678902473e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5487 }, { "completion_length": 659.75, "epoch": 1.5210643015521064, "grad_norm": 0.0, "kl": 0.16988873481750488, "learning_rate": 3.968400385421791e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5488 }, { "completion_length": 638.5, "epoch": 1.5213414634146343, "grad_norm": 0.0, "kl": 0.22257724404335022, "learning_rate": 3.968046046911379e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5489 }, { "completion_length": 681.25, "epoch": 1.5216186252771617, "grad_norm": 0.0, "kl": 0.22322100400924683, "learning_rate": 3.967691663382101e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5490 }, { "completion_length": 633.0, "epoch": 1.5218957871396896, "grad_norm": 0.3870393633842468, "kl": 0.1511252373456955, "learning_rate": 3.967337234844826e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5491 }, { "completion_length": 620.0, "epoch": 1.5221729490022173, "grad_norm": 0.0, "kl": 0.25740599632263184, "learning_rate": 3.9669827613104206e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5492 }, { "completion_length": 789.25, "epoch": 1.522450110864745, "grad_norm": 0.0, "kl": 0.15879744291305542, "learning_rate": 3.9666282427897574e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5493 }, { "completion_length": 589.25, "epoch": 1.5227272727272727, "grad_norm": 0.0, "kl": 0.20242349803447723, "learning_rate": 3.9662736792937065e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5494 }, { "completion_length": 701.25, "epoch": 1.5230044345898004, "grad_norm": 0.0, "kl": 0.14607320725917816, "learning_rate": 3.965919070833141e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5495 }, { "completion_length": 616.0, "epoch": 1.5232815964523283, "grad_norm": 0.0, "kl": 0.2049543410539627, "learning_rate": 3.965564417418936e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5496 }, { "completion_length": 644.75, "epoch": 1.5235587583148558, "grad_norm": 0.0, "kl": 0.18389809131622314, "learning_rate": 3.965209719061967e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5497 }, { "completion_length": 647.5, "epoch": 1.5238359201773837, "grad_norm": 0.0, "kl": 0.17189614474773407, "learning_rate": 3.964854975773112e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5498 }, { "completion_length": 678.5, "epoch": 1.5241130820399114, "grad_norm": 0.0, "kl": 0.1468493938446045, "learning_rate": 3.964500187563248e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5499 }, { "completion_length": 601.25, "epoch": 1.524390243902439, "grad_norm": 0.0, "kl": 0.25857606530189514, "learning_rate": 3.964145354443255e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5500 }, { "completion_length": 843.0, "epoch": 1.5246674057649667, "grad_norm": 0.0, "kl": 0.18753188848495483, "learning_rate": 3.963790476424014e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5501 }, { "completion_length": 649.25, "epoch": 1.5249445676274944, "grad_norm": 0.0, "kl": 0.22246351838111877, "learning_rate": 3.9634355535164095e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5502 }, { "completion_length": 682.25, "epoch": 1.5252217294900223, "grad_norm": 0.39356204867362976, "kl": 7.623286247253418, "learning_rate": 3.963080585731324e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5503 }, { "completion_length": 615.0, "epoch": 1.5254988913525498, "grad_norm": 0.3926834464073181, "kl": 0.220685675740242, "learning_rate": 3.962725573079644e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5504 }, { "completion_length": 638.0, "epoch": 1.5257760532150777, "grad_norm": 0.0, "kl": 0.18134236335754395, "learning_rate": 3.962370515572254e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5505 }, { "completion_length": 647.5, "epoch": 1.5260532150776052, "grad_norm": 0.0, "kl": 0.32140836119651794, "learning_rate": 3.962015413220044e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5506 }, { "completion_length": 656.25, "epoch": 1.526330376940133, "grad_norm": 0.0, "kl": 0.2762204110622406, "learning_rate": 3.9616602660339035e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5507 }, { "completion_length": 675.75, "epoch": 1.5266075388026608, "grad_norm": 0.4169880151748657, "kl": 0.23180246353149414, "learning_rate": 3.961305074024722e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5508 }, { "completion_length": 621.25, "epoch": 1.5268847006651884, "grad_norm": 0.0, "kl": 0.18400011956691742, "learning_rate": 3.960949837203394e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5509 }, { "completion_length": 568.75, "epoch": 1.5271618625277164, "grad_norm": 0.0, "kl": 2.6777169704437256, "learning_rate": 3.960594555580811e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5510 }, { "completion_length": 680.75, "epoch": 1.5274390243902438, "grad_norm": 0.0, "kl": 0.21204937994480133, "learning_rate": 3.960239229167869e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5511 }, { "completion_length": 755.75, "epoch": 1.5277161862527717, "grad_norm": 0.0, "kl": 0.1954176425933838, "learning_rate": 3.959883857975465e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5512 }, { "completion_length": 713.5, "epoch": 1.5279933481152992, "grad_norm": 0.4643799364566803, "kl": 0.8421099781990051, "learning_rate": 3.9595284420144965e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5513 }, { "completion_length": 671.0, "epoch": 1.528270509977827, "grad_norm": 0.0, "kl": 0.19186966121196747, "learning_rate": 3.959172981295861e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5514 }, { "completion_length": 585.25, "epoch": 1.5285476718403548, "grad_norm": 0.0, "kl": 0.20234961807727814, "learning_rate": 3.95881747583046e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5515 }, { "completion_length": 596.75, "epoch": 1.5288248337028825, "grad_norm": 0.0, "kl": 0.1724032312631607, "learning_rate": 3.9584619256291965e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5516 }, { "completion_length": 645.75, "epoch": 1.5291019955654102, "grad_norm": 0.0, "kl": 0.181804820895195, "learning_rate": 3.958106330702972e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5517 }, { "completion_length": 651.0, "epoch": 1.5293791574279378, "grad_norm": 0.0, "kl": 0.32825446128845215, "learning_rate": 3.957750691062693e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5518 }, { "completion_length": 578.75, "epoch": 1.5296563192904657, "grad_norm": 0.0, "kl": 0.2088823765516281, "learning_rate": 3.9573950067192626e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5519 }, { "completion_length": 568.0, "epoch": 1.5299334811529932, "grad_norm": 0.7722257375717163, "kl": 0.16913066804409027, "learning_rate": 3.957039277683592e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5520 }, { "completion_length": 740.5, "epoch": 1.5302106430155211, "grad_norm": 0.0, "kl": 0.15721000730991364, "learning_rate": 3.956683503966586e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5521 }, { "completion_length": 507.5, "epoch": 1.5304878048780488, "grad_norm": 0.0, "kl": 0.2831372916698456, "learning_rate": 3.956327685579158e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5522 }, { "completion_length": 672.25, "epoch": 1.5307649667405765, "grad_norm": 0.40818214416503906, "kl": 0.20406685769557953, "learning_rate": 3.9559718225322174e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5523 }, { "completion_length": 548.75, "epoch": 1.5310421286031042, "grad_norm": 0.0, "kl": 0.205659419298172, "learning_rate": 3.955615914836678e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5524 }, { "completion_length": 571.25, "epoch": 1.5313192904656319, "grad_norm": 0.0, "kl": 0.205240860581398, "learning_rate": 3.955259962503453e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5525 }, { "completion_length": 603.25, "epoch": 1.5315964523281598, "grad_norm": 0.49865397810935974, "kl": 0.15640492737293243, "learning_rate": 3.95490396554346e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5526 }, { "completion_length": 685.5, "epoch": 1.5318736141906872, "grad_norm": 0.0, "kl": 0.19536571204662323, "learning_rate": 3.9545479239676135e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5527 }, { "completion_length": 725.75, "epoch": 1.5321507760532151, "grad_norm": 0.0, "kl": 0.1793302595615387, "learning_rate": 3.954191837786834e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5528 }, { "completion_length": 804.75, "epoch": 1.5324279379157428, "grad_norm": 0.0, "kl": 0.13738755881786346, "learning_rate": 3.953835707012039e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5529 }, { "completion_length": 655.0, "epoch": 1.5327050997782705, "grad_norm": 0.0, "kl": 0.40722978115081787, "learning_rate": 3.953479531654151e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5530 }, { "completion_length": 579.5, "epoch": 1.5329822616407982, "grad_norm": 0.0, "kl": 0.22953550517559052, "learning_rate": 3.953123311724092e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5531 }, { "completion_length": 672.5, "epoch": 1.533259423503326, "grad_norm": 0.0, "kl": 0.7425042986869812, "learning_rate": 3.952767047232786e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5532 }, { "completion_length": 577.25, "epoch": 1.5335365853658538, "grad_norm": 0.0, "kl": 0.21873873472213745, "learning_rate": 3.952410738191158e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5533 }, { "completion_length": 536.5, "epoch": 1.5338137472283813, "grad_norm": 0.0, "kl": 0.2038709819316864, "learning_rate": 3.952054384610134e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5534 }, { "completion_length": 601.75, "epoch": 1.5340909090909092, "grad_norm": 0.0, "kl": 0.21484850347042084, "learning_rate": 3.951697986500643e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5535 }, { "completion_length": 591.0, "epoch": 1.5343680709534369, "grad_norm": 0.4170073866844177, "kl": 0.2664703130722046, "learning_rate": 3.951341543873615e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5536 }, { "completion_length": 690.0, "epoch": 1.5346452328159645, "grad_norm": 0.36694908142089844, "kl": 0.2254546731710434, "learning_rate": 3.950985056739978e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5537 }, { "completion_length": 642.0, "epoch": 1.5349223946784922, "grad_norm": 0.41172537207603455, "kl": 0.31769639253616333, "learning_rate": 3.950628525110665e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5538 }, { "completion_length": 629.25, "epoch": 1.53519955654102, "grad_norm": 0.4282776117324829, "kl": 0.15973396599292755, "learning_rate": 3.950271948996609e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5539 }, { "completion_length": 558.0, "epoch": 1.5354767184035478, "grad_norm": 0.0, "kl": 0.17348435521125793, "learning_rate": 3.9499153284087465e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5540 }, { "completion_length": 619.0, "epoch": 1.5357538802660753, "grad_norm": 0.0, "kl": 0.2509979009628296, "learning_rate": 3.949558663358012e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5541 }, { "completion_length": 692.0, "epoch": 1.5360310421286032, "grad_norm": 0.0, "kl": 0.1707874834537506, "learning_rate": 3.949201953855343e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5542 }, { "completion_length": 610.75, "epoch": 1.5363082039911307, "grad_norm": 0.0, "kl": 0.18852482736110687, "learning_rate": 3.94884519991168e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5543 }, { "completion_length": 723.0, "epoch": 1.5365853658536586, "grad_norm": 0.0, "kl": 0.17603172361850739, "learning_rate": 3.94848840153796e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5544 }, { "completion_length": 585.75, "epoch": 1.5368625277161863, "grad_norm": 0.0, "kl": 0.3295336663722992, "learning_rate": 3.948131558745127e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5545 }, { "completion_length": 590.25, "epoch": 1.537139689578714, "grad_norm": 0.0, "kl": 0.29656198620796204, "learning_rate": 3.947774671544124e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5546 }, { "completion_length": 651.75, "epoch": 1.5374168514412418, "grad_norm": 0.0, "kl": 0.20820343494415283, "learning_rate": 3.9474177399458925e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5547 }, { "completion_length": 656.75, "epoch": 1.5376940133037693, "grad_norm": 0.0, "kl": 0.18529343605041504, "learning_rate": 3.9470607639613815e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5548 }, { "completion_length": 560.25, "epoch": 1.5379711751662972, "grad_norm": 0.4612239599227905, "kl": 0.2828051745891571, "learning_rate": 3.946703743601536e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5549 }, { "completion_length": 604.25, "epoch": 1.5382483370288247, "grad_norm": 0.5335352420806885, "kl": 0.17466327548027039, "learning_rate": 3.946346678877305e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5550 }, { "completion_length": 556.0, "epoch": 1.5385254988913526, "grad_norm": 0.0, "kl": 0.21791504323482513, "learning_rate": 3.945989569799638e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5551 }, { "completion_length": 662.0, "epoch": 1.5388026607538803, "grad_norm": 0.0, "kl": 0.18695731461048126, "learning_rate": 3.945632416379486e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5552 }, { "completion_length": 653.75, "epoch": 1.539079822616408, "grad_norm": 0.4207034409046173, "kl": 0.17663322389125824, "learning_rate": 3.945275218627801e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5553 }, { "completion_length": 594.0, "epoch": 1.5393569844789357, "grad_norm": 0.5366207361221313, "kl": 1.2424845695495605, "learning_rate": 3.944917976555538e-06, "loss": -0.0, "reward": 5.5, "reward_std": 0.5, "rewards/confident_score_func": 1.75, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5554 }, { "completion_length": 622.0, "epoch": 1.5396341463414633, "grad_norm": 0.0, "kl": 0.22016695141792297, "learning_rate": 3.944560690173651e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5555 }, { "completion_length": 647.75, "epoch": 1.5399113082039912, "grad_norm": 0.0, "kl": 0.15283650159835815, "learning_rate": 3.944203359493097e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5556 }, { "completion_length": 699.0, "epoch": 1.5401884700665187, "grad_norm": 0.0, "kl": 0.17698794603347778, "learning_rate": 3.943845984524834e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5557 }, { "completion_length": 583.25, "epoch": 1.5404656319290466, "grad_norm": 0.0, "kl": 0.25207725167274475, "learning_rate": 3.94348856527982e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5558 }, { "completion_length": 663.5, "epoch": 1.5407427937915743, "grad_norm": 0.0, "kl": 0.1816461980342865, "learning_rate": 3.943131101769017e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5559 }, { "completion_length": 475.75, "epoch": 1.541019955654102, "grad_norm": 0.0, "kl": 0.18350911140441895, "learning_rate": 3.942773594003386e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5560 }, { "completion_length": 621.0, "epoch": 1.5412971175166297, "grad_norm": 0.6746737360954285, "kl": 0.4635527729988098, "learning_rate": 3.942416041993891e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5561 }, { "completion_length": 634.25, "epoch": 1.5415742793791574, "grad_norm": 0.0, "kl": 0.16856470704078674, "learning_rate": 3.942058445751497e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5562 }, { "completion_length": 624.0, "epoch": 1.5418514412416853, "grad_norm": 0.5298125147819519, "kl": 0.14674308896064758, "learning_rate": 3.941700805287169e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5563 }, { "completion_length": 653.0, "epoch": 1.5421286031042127, "grad_norm": 0.0, "kl": 0.19535231590270996, "learning_rate": 3.941343120611873e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5564 }, { "completion_length": 652.75, "epoch": 1.5424057649667406, "grad_norm": 0.0, "kl": 0.24941478669643402, "learning_rate": 3.940985391736582e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5565 }, { "completion_length": 618.5, "epoch": 1.5426829268292683, "grad_norm": 0.0, "kl": 0.22976742684841156, "learning_rate": 3.9406276186722604e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5566 }, { "completion_length": 597.0, "epoch": 1.542960088691796, "grad_norm": 0.0, "kl": 0.5940783023834229, "learning_rate": 3.940269801429885e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5567 }, { "completion_length": 617.5, "epoch": 1.5432372505543237, "grad_norm": 0.0, "kl": 0.21111033856868744, "learning_rate": 3.939911940020425e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5568 }, { "completion_length": 595.0, "epoch": 1.5435144124168514, "grad_norm": 0.0, "kl": 0.21236859261989594, "learning_rate": 3.939554034454856e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5569 }, { "completion_length": 644.75, "epoch": 1.5437915742793793, "grad_norm": 0.0, "kl": 0.3049170672893524, "learning_rate": 3.939196084744153e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5570 }, { "completion_length": 614.0, "epoch": 1.5440687361419068, "grad_norm": 0.0, "kl": 0.18922971189022064, "learning_rate": 3.9388380908992925e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5571 }, { "completion_length": 591.0, "epoch": 1.5443458980044347, "grad_norm": 0.3795190155506134, "kl": 4803.7001953125, "learning_rate": 3.938480052931255e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5572 }, { "completion_length": 795.5, "epoch": 1.5446230598669624, "grad_norm": 0.0, "kl": 0.18572217226028442, "learning_rate": 3.938121970851017e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5573 }, { "completion_length": 740.25, "epoch": 1.54490022172949, "grad_norm": 0.4388229548931122, "kl": 30.161773681640625, "learning_rate": 3.9377638446695604e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5574 }, { "completion_length": 661.75, "epoch": 1.5451773835920177, "grad_norm": 0.4587065577507019, "kl": 0.29686111211776733, "learning_rate": 3.937405674397868e-06, "loss": 0.0, "reward": 1.4375, "reward_std": 0.625, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.6875, "step": 5575 }, { "completion_length": 732.5, "epoch": 1.5454545454545454, "grad_norm": 0.3855379819869995, "kl": 0.15899769961833954, "learning_rate": 3.937047460046923e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5576 }, { "completion_length": 670.0, "epoch": 1.5457317073170733, "grad_norm": 0.0, "kl": 0.6056275963783264, "learning_rate": 3.93668920162771e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5577 }, { "completion_length": 612.25, "epoch": 1.5460088691796008, "grad_norm": 0.0, "kl": 0.336734801530838, "learning_rate": 3.936330899151216e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5578 }, { "completion_length": 752.25, "epoch": 1.5462860310421287, "grad_norm": 0.0, "kl": 0.2727248966693878, "learning_rate": 3.935972552628428e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5579 }, { "completion_length": 570.25, "epoch": 1.5465631929046562, "grad_norm": 0.0, "kl": 0.2712984085083008, "learning_rate": 3.9356141620703355e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5580 }, { "completion_length": 569.5, "epoch": 1.546840354767184, "grad_norm": 0.0, "kl": 0.47915512323379517, "learning_rate": 3.9352557274879285e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5581 }, { "completion_length": 777.25, "epoch": 1.5471175166297118, "grad_norm": 0.0, "kl": 0.4363420009613037, "learning_rate": 3.934897248892199e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5582 }, { "completion_length": 619.75, "epoch": 1.5473946784922394, "grad_norm": 0.0, "kl": 0.19373086094856262, "learning_rate": 3.93453872629414e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5583 }, { "completion_length": 694.5, "epoch": 1.5476718403547673, "grad_norm": 0.0, "kl": 0.19085276126861572, "learning_rate": 3.934180159704746e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5584 }, { "completion_length": 687.0, "epoch": 1.5479490022172948, "grad_norm": 0.0, "kl": 0.16553020477294922, "learning_rate": 3.933821549135012e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5585 }, { "completion_length": 707.0, "epoch": 1.5482261640798227, "grad_norm": 0.412445068359375, "kl": 0.18266989290714264, "learning_rate": 3.933462894595936e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5586 }, { "completion_length": 635.75, "epoch": 1.5485033259423502, "grad_norm": 0.0, "kl": 0.17608395218849182, "learning_rate": 3.933104196098516e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5587 }, { "completion_length": 631.5, "epoch": 1.548780487804878, "grad_norm": 0.0, "kl": 0.2045646607875824, "learning_rate": 3.932745453653752e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5588 }, { "completion_length": 694.75, "epoch": 1.5490576496674058, "grad_norm": 0.0, "kl": 0.20242834091186523, "learning_rate": 3.932386667272645e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5589 }, { "completion_length": 602.25, "epoch": 1.5493348115299335, "grad_norm": 0.0, "kl": 0.20796562731266022, "learning_rate": 3.932027836966197e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5590 }, { "completion_length": 704.0, "epoch": 1.5496119733924612, "grad_norm": 0.44526058435440063, "kl": 17.27486228942871, "learning_rate": 3.931668962745413e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5591 }, { "completion_length": 662.0, "epoch": 1.5498891352549888, "grad_norm": 0.4073103070259094, "kl": 0.17129313945770264, "learning_rate": 3.931310044621297e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5592 }, { "completion_length": 743.0, "epoch": 1.5501662971175167, "grad_norm": 0.0, "kl": 6.78053092956543, "learning_rate": 3.9309510826048556e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5593 }, { "completion_length": 707.25, "epoch": 1.5504434589800442, "grad_norm": 0.0, "kl": 0.1596636027097702, "learning_rate": 3.930592076707099e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5594 }, { "completion_length": 711.75, "epoch": 1.5507206208425721, "grad_norm": 0.0, "kl": 0.1671547293663025, "learning_rate": 3.930233026939033e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5595 }, { "completion_length": 707.0, "epoch": 1.5509977827050998, "grad_norm": 0.0, "kl": 0.1897270530462265, "learning_rate": 3.92987393331167e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5596 }, { "completion_length": 724.25, "epoch": 1.5512749445676275, "grad_norm": 0.5510002970695496, "kl": 12.444069862365723, "learning_rate": 3.929514795836021e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5597 }, { "completion_length": 614.0, "epoch": 1.5515521064301552, "grad_norm": 0.5526188015937805, "kl": 1.66478431224823, "learning_rate": 3.9291556145231e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5598 }, { "completion_length": 797.25, "epoch": 1.5518292682926829, "grad_norm": 0.0, "kl": 0.17136390507221222, "learning_rate": 3.928796389383923e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5599 }, { "completion_length": 664.0, "epoch": 1.5521064301552108, "grad_norm": 0.36083149909973145, "kl": 0.7329142689704895, "learning_rate": 3.928437120429503e-06, "loss": 0.0, "reward": 1.375, "reward_std": 0.75, "rewards/confident_score_func": -0.25, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.75, "step": 5600 }, { "completion_length": 674.75, "epoch": 1.5523835920177382, "grad_norm": 0.0, "kl": 0.1811513751745224, "learning_rate": 3.928077807670858e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5601 }, { "completion_length": 626.5, "epoch": 1.5526607538802661, "grad_norm": 0.0, "kl": 0.1739104837179184, "learning_rate": 3.927718451119009e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5602 }, { "completion_length": 600.0, "epoch": 1.5529379157427938, "grad_norm": 0.0, "kl": 0.23863427340984344, "learning_rate": 3.9273590507849736e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5603 }, { "completion_length": 766.5, "epoch": 1.5532150776053215, "grad_norm": 0.0, "kl": 0.32771167159080505, "learning_rate": 3.926999606679773e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5604 }, { "completion_length": 625.5, "epoch": 1.5534922394678492, "grad_norm": 0.5434783101081848, "kl": 81.47868347167969, "learning_rate": 3.926640118814431e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5605 }, { "completion_length": 598.75, "epoch": 1.5537694013303769, "grad_norm": 0.0, "kl": 0.18646946549415588, "learning_rate": 3.926280587199972e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5606 }, { "completion_length": 569.0, "epoch": 1.5540465631929048, "grad_norm": 0.0, "kl": 1.2734800577163696, "learning_rate": 3.92592101184742e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5607 }, { "completion_length": 716.0, "epoch": 1.5543237250554323, "grad_norm": 0.0, "kl": 0.2357020527124405, "learning_rate": 3.925561392767802e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5608 }, { "completion_length": 652.25, "epoch": 1.5546008869179602, "grad_norm": 0.0, "kl": 0.19808994233608246, "learning_rate": 3.9252017299721464e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5609 }, { "completion_length": 642.5, "epoch": 1.5548780487804879, "grad_norm": 0.0, "kl": 1.1269224882125854, "learning_rate": 3.924842023471484e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5610 }, { "completion_length": 739.75, "epoch": 1.5551552106430155, "grad_norm": 0.0, "kl": 0.16409872472286224, "learning_rate": 3.9244822732768415e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5611 }, { "completion_length": 613.5, "epoch": 1.5554323725055432, "grad_norm": 0.0, "kl": 1.1770226955413818, "learning_rate": 3.924122479399255e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5612 }, { "completion_length": 567.25, "epoch": 1.555709534368071, "grad_norm": 0.0, "kl": 0.18865866959095, "learning_rate": 3.923762641849755e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5613 }, { "completion_length": 576.0, "epoch": 1.5559866962305988, "grad_norm": 0.0, "kl": 0.1788136065006256, "learning_rate": 3.923402760639379e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5614 }, { "completion_length": 580.75, "epoch": 1.5562638580931263, "grad_norm": 0.0, "kl": 0.1859707534313202, "learning_rate": 3.92304283577916e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5615 }, { "completion_length": 647.0, "epoch": 1.5565410199556542, "grad_norm": 0.0, "kl": 0.16213083267211914, "learning_rate": 3.922682867280138e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5616 }, { "completion_length": 636.75, "epoch": 1.5568181818181817, "grad_norm": 0.0, "kl": 0.25931262969970703, "learning_rate": 3.9223228551533495e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5617 }, { "completion_length": 668.0, "epoch": 1.5570953436807096, "grad_norm": 0.0, "kl": 0.21938176453113556, "learning_rate": 3.921962799409838e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5618 }, { "completion_length": 644.5, "epoch": 1.5573725055432373, "grad_norm": 0.0, "kl": 0.3035065531730652, "learning_rate": 3.921602700060641e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5619 }, { "completion_length": 733.75, "epoch": 1.557649667405765, "grad_norm": 0.0, "kl": 0.16983027756214142, "learning_rate": 3.921242557116802e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5620 }, { "completion_length": 705.25, "epoch": 1.5579268292682928, "grad_norm": 0.0, "kl": 0.14530542492866516, "learning_rate": 3.9208823705893674e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5621 }, { "completion_length": 602.75, "epoch": 1.5582039911308203, "grad_norm": 0.0, "kl": 0.24075284600257874, "learning_rate": 3.920522140489381e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5622 }, { "completion_length": 676.0, "epoch": 1.5584811529933482, "grad_norm": 0.0, "kl": 0.1605144888162613, "learning_rate": 3.92016186682789e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5623 }, { "completion_length": 568.5, "epoch": 1.5587583148558757, "grad_norm": 0.0, "kl": 0.2107573002576828, "learning_rate": 3.919801549615942e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5624 }, { "completion_length": 590.5, "epoch": 1.5590354767184036, "grad_norm": 0.0, "kl": 0.192813903093338, "learning_rate": 3.919441188864587e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5625 }, { "completion_length": 766.0, "epoch": 1.5593126385809313, "grad_norm": 0.0, "kl": 0.24839872121810913, "learning_rate": 3.919080784584875e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5626 }, { "completion_length": 710.5, "epoch": 1.559589800443459, "grad_norm": 0.0, "kl": 0.16931918263435364, "learning_rate": 3.918720336787859e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5627 }, { "completion_length": 709.0, "epoch": 1.5598669623059866, "grad_norm": 0.46946898102760315, "kl": 1.5770139694213867, "learning_rate": 3.918359845484591e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5628 }, { "completion_length": 530.0, "epoch": 1.5601441241685143, "grad_norm": 0.0, "kl": 7.478653907775879, "learning_rate": 3.917999310686128e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5629 }, { "completion_length": 588.0, "epoch": 1.5604212860310422, "grad_norm": 0.0, "kl": 0.17335858941078186, "learning_rate": 3.917638732403524e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5630 }, { "completion_length": 728.0, "epoch": 1.5606984478935697, "grad_norm": 0.0, "kl": 0.3648698627948761, "learning_rate": 3.917278110647837e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5631 }, { "completion_length": 675.0, "epoch": 1.5609756097560976, "grad_norm": 0.0, "kl": 0.1750655323266983, "learning_rate": 3.916917445430127e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5632 }, { "completion_length": 782.75, "epoch": 1.5612527716186253, "grad_norm": 0.4432072937488556, "kl": 4.568885803222656, "learning_rate": 3.9165567367614535e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5633 }, { "completion_length": 648.25, "epoch": 1.561529933481153, "grad_norm": 0.0, "kl": 0.18036723136901855, "learning_rate": 3.9161959846528775e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5634 }, { "completion_length": 581.5, "epoch": 1.5618070953436807, "grad_norm": 0.0, "kl": 0.18341714143753052, "learning_rate": 3.915835189115461e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5635 }, { "completion_length": 689.0, "epoch": 1.5620842572062084, "grad_norm": 0.0, "kl": 0.2769691050052643, "learning_rate": 3.91547435016027e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5636 }, { "completion_length": 660.25, "epoch": 1.5623614190687363, "grad_norm": 0.0, "kl": 0.20867937803268433, "learning_rate": 3.915113467798367e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5637 }, { "completion_length": 650.25, "epoch": 1.5626385809312637, "grad_norm": 0.0, "kl": 0.1575072705745697, "learning_rate": 3.914752542040823e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5638 }, { "completion_length": 691.5, "epoch": 1.5629157427937916, "grad_norm": 0.43364790081977844, "kl": 0.240375354886055, "learning_rate": 3.9143915728987025e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5639 }, { "completion_length": 634.0, "epoch": 1.5631929046563193, "grad_norm": 0.0, "kl": 0.19570866227149963, "learning_rate": 3.914030560383077e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5640 }, { "completion_length": 628.75, "epoch": 1.563470066518847, "grad_norm": 0.0, "kl": 2.356261730194092, "learning_rate": 3.913669504505015e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5641 }, { "completion_length": 653.25, "epoch": 1.5637472283813747, "grad_norm": 0.0, "kl": 4.071694374084473, "learning_rate": 3.913308405275591e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5642 }, { "completion_length": 578.25, "epoch": 1.5640243902439024, "grad_norm": 0.0, "kl": 0.2041882872581482, "learning_rate": 3.912947262705877e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5643 }, { "completion_length": 566.5, "epoch": 1.5643015521064303, "grad_norm": 0.0, "kl": 0.17846529185771942, "learning_rate": 3.912586076806949e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5644 }, { "completion_length": 641.0, "epoch": 1.5645787139689578, "grad_norm": 0.39478081464767456, "kl": 0.1732311099767685, "learning_rate": 3.912224847589882e-06, "loss": 0.0, "reward": 4.625, "reward_std": 2.25, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5645 }, { "completion_length": 663.75, "epoch": 1.5648558758314857, "grad_norm": 0.0, "kl": 0.3585583567619324, "learning_rate": 3.911863575065753e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5646 }, { "completion_length": 671.75, "epoch": 1.5651330376940134, "grad_norm": 0.0, "kl": 0.32039397954940796, "learning_rate": 3.911502259245642e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5647 }, { "completion_length": 608.0, "epoch": 1.565410199556541, "grad_norm": 0.0, "kl": 0.22230760753154755, "learning_rate": 3.911140900140627e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5648 }, { "completion_length": 573.25, "epoch": 1.5656873614190687, "grad_norm": 0.0, "kl": 0.23678873479366302, "learning_rate": 3.910779497761793e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5649 }, { "completion_length": 635.25, "epoch": 1.5659645232815964, "grad_norm": 0.573814332485199, "kl": 0.38245201110839844, "learning_rate": 3.910418052120219e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5650 }, { "completion_length": 759.25, "epoch": 1.5662416851441243, "grad_norm": 0.0, "kl": 0.17085540294647217, "learning_rate": 3.910056563226991e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5651 }, { "completion_length": 640.5, "epoch": 1.5665188470066518, "grad_norm": 0.0, "kl": 0.22678348422050476, "learning_rate": 3.909695031093193e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5652 }, { "completion_length": 684.75, "epoch": 1.5667960088691797, "grad_norm": 0.5614250302314758, "kl": 0.1818414032459259, "learning_rate": 3.909333455729914e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5653 }, { "completion_length": 640.0, "epoch": 1.5670731707317072, "grad_norm": 0.0, "kl": 0.44077548384666443, "learning_rate": 3.90897183714824e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5654 }, { "completion_length": 605.75, "epoch": 1.567350332594235, "grad_norm": 0.0, "kl": 0.3123577833175659, "learning_rate": 3.908610175359261e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5655 }, { "completion_length": 784.25, "epoch": 1.5676274944567627, "grad_norm": 0.5418803095817566, "kl": 0.21207685768604279, "learning_rate": 3.9082484703740675e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5656 }, { "completion_length": 720.5, "epoch": 1.5679046563192904, "grad_norm": 0.0, "kl": 0.24448038637638092, "learning_rate": 3.907886722203752e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5657 }, { "completion_length": 710.0, "epoch": 1.5681818181818183, "grad_norm": 0.0, "kl": 0.17919574677944183, "learning_rate": 3.907524930859407e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5658 }, { "completion_length": 729.5, "epoch": 1.5684589800443458, "grad_norm": 0.0, "kl": 0.17421644926071167, "learning_rate": 3.907163096352127e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5659 }, { "completion_length": 691.5, "epoch": 1.5687361419068737, "grad_norm": 0.0, "kl": 0.23940420150756836, "learning_rate": 3.90680121869301e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5660 }, { "completion_length": 639.75, "epoch": 1.5690133037694012, "grad_norm": 0.0, "kl": 0.18506738543510437, "learning_rate": 3.906439297893151e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5661 }, { "completion_length": 726.5, "epoch": 1.569290465631929, "grad_norm": 0.49285951256752014, "kl": 0.6828257441520691, "learning_rate": 3.906077333963648e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5662 }, { "completion_length": 629.5, "epoch": 1.5695676274944568, "grad_norm": 0.0, "kl": 0.18427975475788116, "learning_rate": 3.905715326915604e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5663 }, { "completion_length": 695.0, "epoch": 1.5698447893569845, "grad_norm": 0.0, "kl": 6.430425643920898, "learning_rate": 3.9053532767601175e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5664 }, { "completion_length": 605.5, "epoch": 1.5701219512195121, "grad_norm": 0.6881683468818665, "kl": 11.65971851348877, "learning_rate": 3.904991183508293e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5665 }, { "completion_length": 752.75, "epoch": 1.5703991130820398, "grad_norm": 0.0, "kl": 0.14858777821063995, "learning_rate": 3.9046290471712335e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5666 }, { "completion_length": 660.75, "epoch": 1.5706762749445677, "grad_norm": 0.4297894537448883, "kl": 0.605490505695343, "learning_rate": 3.904266867760044e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5667 }, { "completion_length": 646.25, "epoch": 1.5709534368070952, "grad_norm": 0.0, "kl": 0.5373188257217407, "learning_rate": 3.903904645285831e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5668 }, { "completion_length": 645.75, "epoch": 1.5712305986696231, "grad_norm": 0.0, "kl": 0.20581115782260895, "learning_rate": 3.903542379759703e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5669 }, { "completion_length": 681.25, "epoch": 1.5715077605321508, "grad_norm": 0.0, "kl": 0.3484470248222351, "learning_rate": 3.903180071192768e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5670 }, { "completion_length": 649.25, "epoch": 1.5717849223946785, "grad_norm": 0.0, "kl": 0.2132457047700882, "learning_rate": 3.902817719596138e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5671 }, { "completion_length": 645.0, "epoch": 1.5720620842572062, "grad_norm": 0.0, "kl": 0.2323303073644638, "learning_rate": 3.902455324980925e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5672 }, { "completion_length": 636.0, "epoch": 1.5723392461197339, "grad_norm": 0.0, "kl": 0.17696914076805115, "learning_rate": 3.9020928873582395e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5673 }, { "completion_length": 724.25, "epoch": 1.5726164079822618, "grad_norm": 0.0, "kl": 0.2294234037399292, "learning_rate": 3.901730406739198e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5674 }, { "completion_length": 665.0, "epoch": 1.5728935698447892, "grad_norm": 0.0, "kl": 0.18962430953979492, "learning_rate": 3.901367883134916e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5675 }, { "completion_length": 684.25, "epoch": 1.5731707317073171, "grad_norm": 0.0, "kl": 0.16891327500343323, "learning_rate": 3.901005316556512e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5676 }, { "completion_length": 583.0, "epoch": 1.5734478935698448, "grad_norm": 0.0, "kl": 0.4580535590648651, "learning_rate": 3.900642707015102e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5677 }, { "completion_length": 747.0, "epoch": 1.5737250554323725, "grad_norm": 0.0, "kl": 0.17155136168003082, "learning_rate": 3.900280054521807e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5678 }, { "completion_length": 744.75, "epoch": 1.5740022172949002, "grad_norm": 0.0, "kl": 6384.23095703125, "learning_rate": 3.899917359087747e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5679 }, { "completion_length": 693.25, "epoch": 1.5742793791574279, "grad_norm": 0.0, "kl": 0.1861504763364792, "learning_rate": 3.8995546207240455e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5680 }, { "completion_length": 692.0, "epoch": 1.5745565410199558, "grad_norm": 0.6460914611816406, "kl": 5515.41455078125, "learning_rate": 3.899191839441826e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5681 }, { "completion_length": 590.75, "epoch": 1.5748337028824833, "grad_norm": 0.0, "kl": 0.1989145427942276, "learning_rate": 3.898829015252213e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5682 }, { "completion_length": 632.25, "epoch": 1.5751108647450112, "grad_norm": 0.0, "kl": 0.23067516088485718, "learning_rate": 3.898466148166333e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5683 }, { "completion_length": 677.75, "epoch": 1.5753880266075388, "grad_norm": 0.0, "kl": 0.23202979564666748, "learning_rate": 3.898103238195314e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5684 }, { "completion_length": 596.75, "epoch": 1.5756651884700665, "grad_norm": 0.6965190768241882, "kl": 1099.48681640625, "learning_rate": 3.897740285350286e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5685 }, { "completion_length": 699.75, "epoch": 1.5759423503325942, "grad_norm": 0.0, "kl": 0.1955593228340149, "learning_rate": 3.897377289642376e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5686 }, { "completion_length": 871.0, "epoch": 1.576219512195122, "grad_norm": 0.0, "kl": 0.19229485094547272, "learning_rate": 3.897014251082718e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5687 }, { "completion_length": 654.5, "epoch": 1.5764966740576498, "grad_norm": 0.0, "kl": 0.20433007180690765, "learning_rate": 3.896651169682444e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5688 }, { "completion_length": 676.75, "epoch": 1.5767738359201773, "grad_norm": 1.0047858953475952, "kl": 0.4291810691356659, "learning_rate": 3.896288045452688e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5689 }, { "completion_length": 720.0, "epoch": 1.5770509977827052, "grad_norm": 0.4023439586162567, "kl": 217.702392578125, "learning_rate": 3.895924878404588e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5690 }, { "completion_length": 798.5, "epoch": 1.5773281596452327, "grad_norm": 0.0, "kl": 0.3832925856113434, "learning_rate": 3.895561668549278e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5691 }, { "completion_length": 714.0, "epoch": 1.5776053215077606, "grad_norm": 0.0, "kl": 0.19948746263980865, "learning_rate": 3.895198415897896e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5692 }, { "completion_length": 706.0, "epoch": 1.5778824833702882, "grad_norm": 0.0, "kl": 0.18394021689891815, "learning_rate": 3.8948351204615846e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5693 }, { "completion_length": 600.75, "epoch": 1.578159645232816, "grad_norm": 0.0, "kl": 0.18926526606082916, "learning_rate": 3.894471782251481e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5694 }, { "completion_length": 789.75, "epoch": 1.5784368070953438, "grad_norm": 0.0, "kl": 0.16304698586463928, "learning_rate": 3.894108401278728e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5695 }, { "completion_length": 665.75, "epoch": 1.5787139689578713, "grad_norm": 0.0, "kl": 38.02091979980469, "learning_rate": 3.893744977554471e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5696 }, { "completion_length": 698.5, "epoch": 1.5789911308203992, "grad_norm": 0.0, "kl": 0.18073973059654236, "learning_rate": 3.893381511089852e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5697 }, { "completion_length": 612.75, "epoch": 1.5792682926829267, "grad_norm": 0.3953545093536377, "kl": 0.1629689633846283, "learning_rate": 3.89301800189602e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5698 }, { "completion_length": 683.25, "epoch": 1.5795454545454546, "grad_norm": 0.6395158171653748, "kl": 0.16275829076766968, "learning_rate": 3.8926544499841196e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5699 }, { "completion_length": 629.5, "epoch": 1.5798226164079823, "grad_norm": 0.0, "kl": 0.1784195899963379, "learning_rate": 3.892290855365301e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5700 }, { "completion_length": 629.5, "epoch": 1.58009977827051, "grad_norm": 0.0, "kl": 0.1941932886838913, "learning_rate": 3.891927218050714e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5701 }, { "completion_length": 598.25, "epoch": 1.5803769401330376, "grad_norm": 0.0, "kl": 0.18143780529499054, "learning_rate": 3.891563538051508e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5702 }, { "completion_length": 686.0, "epoch": 1.5806541019955653, "grad_norm": 0.4988251030445099, "kl": 0.1871829330921173, "learning_rate": 3.891199815378839e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5703 }, { "completion_length": 742.0, "epoch": 1.5809312638580932, "grad_norm": 0.0, "kl": 0.18034540116786957, "learning_rate": 3.890836050043857e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5704 }, { "completion_length": 656.25, "epoch": 1.5812084257206207, "grad_norm": 0.0, "kl": 0.1719006896018982, "learning_rate": 3.89047224205772e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5705 }, { "completion_length": 672.0, "epoch": 1.5814855875831486, "grad_norm": 0.0, "kl": 0.18047992885112762, "learning_rate": 3.890108391431584e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5706 }, { "completion_length": 682.5, "epoch": 1.5817627494456763, "grad_norm": 0.0, "kl": 0.20545995235443115, "learning_rate": 3.889744498176606e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5707 }, { "completion_length": 650.75, "epoch": 1.582039911308204, "grad_norm": 0.0, "kl": 0.1864704191684723, "learning_rate": 3.889380562303946e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5708 }, { "completion_length": 561.5, "epoch": 1.5823170731707317, "grad_norm": 0.0, "kl": 0.2052014321088791, "learning_rate": 3.889016583824763e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5709 }, { "completion_length": 613.0, "epoch": 1.5825942350332594, "grad_norm": 0.0, "kl": 0.2467048168182373, "learning_rate": 3.88865256275022e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5710 }, { "completion_length": 641.5, "epoch": 1.5828713968957873, "grad_norm": 0.0, "kl": 0.23176191747188568, "learning_rate": 3.888288499091479e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5711 }, { "completion_length": 705.75, "epoch": 1.5831485587583147, "grad_norm": 0.0, "kl": 0.4682733118534088, "learning_rate": 3.887924392859705e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5712 }, { "completion_length": 671.75, "epoch": 1.5834257206208426, "grad_norm": 0.0, "kl": 12.616329193115234, "learning_rate": 3.8875602440660635e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5713 }, { "completion_length": 586.75, "epoch": 1.5837028824833703, "grad_norm": 0.6617813110351562, "kl": 1884.5914306640625, "learning_rate": 3.8871960527217215e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5714 }, { "completion_length": 661.5, "epoch": 1.583980044345898, "grad_norm": 0.9253402352333069, "kl": 1.7941397428512573, "learning_rate": 3.8868318188378475e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5715 }, { "completion_length": 632.75, "epoch": 1.5842572062084257, "grad_norm": 0.0, "kl": 70.05265808105469, "learning_rate": 3.8864675424256105e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5716 }, { "completion_length": 598.25, "epoch": 1.5845343680709534, "grad_norm": 0.9314500093460083, "kl": 0.8526790142059326, "learning_rate": 3.886103223496181e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5717 }, { "completion_length": 579.25, "epoch": 1.5848115299334813, "grad_norm": 0.0, "kl": 0.3641619086265564, "learning_rate": 3.885738862060733e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5718 }, { "completion_length": 614.75, "epoch": 1.5850886917960088, "grad_norm": 0.0, "kl": 0.16302232444286346, "learning_rate": 3.8853744581304376e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5719 }, { "completion_length": 704.75, "epoch": 1.5853658536585367, "grad_norm": 0.0, "kl": 0.20741486549377441, "learning_rate": 3.8850100117164705e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5720 }, { "completion_length": 656.25, "epoch": 1.5856430155210643, "grad_norm": 0.0, "kl": 0.18029338121414185, "learning_rate": 3.884645522830008e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5721 }, { "completion_length": 712.5, "epoch": 1.585920177383592, "grad_norm": 0.0, "kl": 0.17175856232643127, "learning_rate": 3.884280991482227e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5722 }, { "completion_length": 1133.75, "epoch": 1.5861973392461197, "grad_norm": 0.0, "kl": 0.13674427568912506, "learning_rate": 3.883916417684308e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5723 }, { "completion_length": 682.5, "epoch": 1.5864745011086474, "grad_norm": 0.0, "kl": 0.23593631386756897, "learning_rate": 3.8835518014474286e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5724 }, { "completion_length": 706.0, "epoch": 1.5867516629711753, "grad_norm": 0.0, "kl": 0.18945881724357605, "learning_rate": 3.88318714278277e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5725 }, { "completion_length": 745.25, "epoch": 1.5870288248337028, "grad_norm": 0.0, "kl": 0.14947724342346191, "learning_rate": 3.882822441701516e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5726 }, { "completion_length": 772.25, "epoch": 1.5873059866962307, "grad_norm": 0.36535337567329407, "kl": 0.14412710070610046, "learning_rate": 3.88245769821485e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5727 }, { "completion_length": 606.25, "epoch": 1.5875831485587582, "grad_norm": 0.0, "kl": 0.19244007766246796, "learning_rate": 3.882092912333958e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5728 }, { "completion_length": 734.25, "epoch": 1.587860310421286, "grad_norm": 0.0, "kl": 0.1815323680639267, "learning_rate": 3.881728084070025e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5729 }, { "completion_length": 698.5, "epoch": 1.5881374722838137, "grad_norm": 0.43390777707099915, "kl": 0.15013228356838226, "learning_rate": 3.88136321343424e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5730 }, { "completion_length": 714.25, "epoch": 1.5884146341463414, "grad_norm": 0.0, "kl": 0.15610718727111816, "learning_rate": 3.8809983004377915e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5731 }, { "completion_length": 786.0, "epoch": 1.5886917960088693, "grad_norm": 0.38922804594039917, "kl": 3643.71923828125, "learning_rate": 3.88063334509187e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5732 }, { "completion_length": 723.5, "epoch": 1.5889689578713968, "grad_norm": 0.0, "kl": 1.4758117198944092, "learning_rate": 3.880268347407667e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5733 }, { "completion_length": 571.25, "epoch": 1.5892461197339247, "grad_norm": 0.0, "kl": 11.857400894165039, "learning_rate": 3.879903307396375e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5734 }, { "completion_length": 676.0, "epoch": 1.5895232815964522, "grad_norm": 0.0, "kl": 0.7541184425354004, "learning_rate": 3.879538225069189e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5735 }, { "completion_length": 719.5, "epoch": 1.58980044345898, "grad_norm": 0.0, "kl": 0.1686319261789322, "learning_rate": 3.879173100437306e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5736 }, { "completion_length": 571.75, "epoch": 1.5900776053215078, "grad_norm": 0.578767716884613, "kl": 8396.1259765625, "learning_rate": 3.8788079335119195e-06, "loss": -0.0, "reward": 4.09375, "reward_std": 3.3125, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.59375, "step": 5737 }, { "completion_length": 710.75, "epoch": 1.5903547671840355, "grad_norm": 0.43676209449768066, "kl": 4.559342384338379, "learning_rate": 3.87844272430423e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5738 }, { "completion_length": 651.75, "epoch": 1.5906319290465631, "grad_norm": 0.0, "kl": 0.5092099905014038, "learning_rate": 3.878077472825436e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5739 }, { "completion_length": 621.0, "epoch": 1.5909090909090908, "grad_norm": 0.0, "kl": 0.18415243923664093, "learning_rate": 3.877712179086739e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5740 }, { "completion_length": 720.5, "epoch": 1.5911862527716187, "grad_norm": 0.0, "kl": 0.16385464370250702, "learning_rate": 3.87734684309934e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5741 }, { "completion_length": 781.75, "epoch": 1.5914634146341462, "grad_norm": 0.39459991455078125, "kl": 0.12947116792201996, "learning_rate": 3.876981464874443e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5742 }, { "completion_length": 708.5, "epoch": 1.591740576496674, "grad_norm": 0.0, "kl": 0.22334632277488708, "learning_rate": 3.876616044423253e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5743 }, { "completion_length": 717.5, "epoch": 1.5920177383592018, "grad_norm": 0.0, "kl": 0.3092197775840759, "learning_rate": 3.876250581756975e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5744 }, { "completion_length": 719.5, "epoch": 1.5922949002217295, "grad_norm": 0.0, "kl": 0.5065204501152039, "learning_rate": 3.875885076886817e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5745 }, { "completion_length": 702.75, "epoch": 1.5925720620842572, "grad_norm": 0.4323088228702545, "kl": 80.76045989990234, "learning_rate": 3.875519529823988e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5746 }, { "completion_length": 634.75, "epoch": 1.5928492239467849, "grad_norm": 0.0, "kl": 0.17323103547096252, "learning_rate": 3.875153940579696e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5747 }, { "completion_length": 667.25, "epoch": 1.5931263858093128, "grad_norm": 0.0, "kl": 0.1984056532382965, "learning_rate": 3.874788309165153e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5748 }, { "completion_length": 706.0, "epoch": 1.5934035476718402, "grad_norm": 0.0, "kl": 0.24678003787994385, "learning_rate": 3.874422635591573e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5749 }, { "completion_length": 588.5, "epoch": 1.5936807095343681, "grad_norm": 0.0, "kl": 0.25480470061302185, "learning_rate": 3.874056919870166e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5750 }, { "completion_length": 711.5, "epoch": 1.5939578713968958, "grad_norm": 0.0, "kl": 151220.859375, "learning_rate": 3.873691162012151e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5751 }, { "completion_length": 643.0, "epoch": 1.5942350332594235, "grad_norm": 0.0, "kl": 0.16580219566822052, "learning_rate": 3.873325362028742e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5752 }, { "completion_length": 733.5, "epoch": 1.5945121951219512, "grad_norm": 0.0, "kl": 0.20116950571537018, "learning_rate": 3.872959519931156e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5753 }, { "completion_length": 921.0, "epoch": 1.5947893569844789, "grad_norm": 0.0, "kl": 0.14235252141952515, "learning_rate": 3.872593635730615e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5754 }, { "completion_length": 685.5, "epoch": 1.5950665188470068, "grad_norm": 0.0, "kl": 0.23114022612571716, "learning_rate": 3.872227709438336e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5755 }, { "completion_length": 732.5, "epoch": 1.5953436807095343, "grad_norm": 0.35854995250701904, "kl": 2973.902099609375, "learning_rate": 3.871861741065541e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5756 }, { "completion_length": 703.0, "epoch": 1.5956208425720622, "grad_norm": 0.0, "kl": 0.18998350203037262, "learning_rate": 3.871495730623453e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5757 }, { "completion_length": 584.5, "epoch": 1.5958980044345898, "grad_norm": 0.0, "kl": 0.17800678312778473, "learning_rate": 3.871129678123297e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5758 }, { "completion_length": 626.75, "epoch": 1.5961751662971175, "grad_norm": 0.4609983563423157, "kl": 5310.935546875, "learning_rate": 3.8707635835762975e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5759 }, { "completion_length": 681.0, "epoch": 1.5964523281596452, "grad_norm": 0.0, "kl": 0.20214533805847168, "learning_rate": 3.870397446993681e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5760 }, { "completion_length": 630.5, "epoch": 1.596729490022173, "grad_norm": 0.49963054060935974, "kl": 0.1644597351551056, "learning_rate": 3.870031268386676e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5761 }, { "completion_length": 843.25, "epoch": 1.5970066518847008, "grad_norm": 0.31574270129203796, "kl": 0.31464216113090515, "learning_rate": 3.86966504776651e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5762 }, { "completion_length": 624.75, "epoch": 1.5972838137472283, "grad_norm": 0.0, "kl": 0.3526344895362854, "learning_rate": 3.869298785144415e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5763 }, { "completion_length": 667.5, "epoch": 1.5975609756097562, "grad_norm": 0.0, "kl": 0.19027350842952728, "learning_rate": 3.868932480531623e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5764 }, { "completion_length": 626.25, "epoch": 1.5978381374722836, "grad_norm": 0.0, "kl": 0.186077281832695, "learning_rate": 3.868566133939366e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5765 }, { "completion_length": 693.5, "epoch": 1.5981152993348116, "grad_norm": 0.0, "kl": 0.17897997796535492, "learning_rate": 3.868199745378879e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5766 }, { "completion_length": 629.25, "epoch": 1.5983924611973392, "grad_norm": 0.0, "kl": 0.3480371832847595, "learning_rate": 3.867833314861397e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5767 }, { "completion_length": 679.75, "epoch": 1.598669623059867, "grad_norm": 0.0, "kl": 0.16263850033283234, "learning_rate": 3.867466842398158e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5768 }, { "completion_length": 704.5, "epoch": 1.5989467849223948, "grad_norm": 0.0, "kl": 0.15656337141990662, "learning_rate": 3.867100328000399e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5769 }, { "completion_length": 643.25, "epoch": 1.5992239467849223, "grad_norm": 0.0, "kl": 0.24562205374240875, "learning_rate": 3.86673377167936e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5770 }, { "completion_length": 737.75, "epoch": 1.5995011086474502, "grad_norm": 0.0, "kl": 0.14907944202423096, "learning_rate": 3.866367173446281e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5771 }, { "completion_length": 685.5, "epoch": 1.5997782705099777, "grad_norm": 0.0, "kl": 0.23148460686206818, "learning_rate": 3.866000533312405e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5772 }, { "completion_length": 742.5, "epoch": 1.6000554323725056, "grad_norm": 0.0, "kl": 0.17010194063186646, "learning_rate": 3.865633851288975e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5773 }, { "completion_length": 611.0, "epoch": 1.6003325942350333, "grad_norm": 0.0, "kl": 0.2667599022388458, "learning_rate": 3.865267127387237e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5774 }, { "completion_length": 650.25, "epoch": 1.600609756097561, "grad_norm": 0.0, "kl": 0.25029414892196655, "learning_rate": 3.864900361618435e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5775 }, { "completion_length": 615.25, "epoch": 1.6008869179600886, "grad_norm": 0.0, "kl": 0.2170053869485855, "learning_rate": 3.864533553993817e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5776 }, { "completion_length": 693.75, "epoch": 1.6011640798226163, "grad_norm": 0.0, "kl": 3.64493465423584, "learning_rate": 3.86416670452463e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5777 }, { "completion_length": 778.25, "epoch": 1.6014412416851442, "grad_norm": 0.0, "kl": 0.21437914669513702, "learning_rate": 3.863799813222125e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5778 }, { "completion_length": 630.5, "epoch": 1.6017184035476717, "grad_norm": 0.0, "kl": 0.4640812575817108, "learning_rate": 3.863432880097554e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5779 }, { "completion_length": 787.75, "epoch": 1.6019955654101996, "grad_norm": 0.5846437811851501, "kl": 0.1792820245027542, "learning_rate": 3.8630659051621685e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5780 }, { "completion_length": 710.25, "epoch": 1.6022727272727273, "grad_norm": 0.0, "kl": 0.18115368485450745, "learning_rate": 3.862698888427221e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5781 }, { "completion_length": 635.0, "epoch": 1.602549889135255, "grad_norm": 0.0, "kl": 0.41541483998298645, "learning_rate": 3.8623318299039675e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5782 }, { "completion_length": 661.5, "epoch": 1.6028270509977827, "grad_norm": 0.0, "kl": 0.2097100168466568, "learning_rate": 3.861964729603664e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5783 }, { "completion_length": 790.5, "epoch": 1.6031042128603104, "grad_norm": 0.0, "kl": 0.3057890236377716, "learning_rate": 3.861597587537568e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5784 }, { "completion_length": 728.0, "epoch": 1.6033813747228383, "grad_norm": 0.0, "kl": 0.18230170011520386, "learning_rate": 3.861230403716938e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5785 }, { "completion_length": 705.75, "epoch": 1.6036585365853657, "grad_norm": 0.0, "kl": 0.2586725950241089, "learning_rate": 3.860863178153034e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5786 }, { "completion_length": 717.75, "epoch": 1.6039356984478936, "grad_norm": 0.0, "kl": 0.17261803150177002, "learning_rate": 3.860495910857117e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5787 }, { "completion_length": 741.75, "epoch": 1.6042128603104213, "grad_norm": 0.0, "kl": 0.17479361593723297, "learning_rate": 3.860128601840451e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5788 }, { "completion_length": 639.0, "epoch": 1.604490022172949, "grad_norm": 0.0, "kl": 0.1977207064628601, "learning_rate": 3.859761251114299e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5789 }, { "completion_length": 658.75, "epoch": 1.6047671840354767, "grad_norm": 0.0, "kl": 0.20529061555862427, "learning_rate": 3.859393858689925e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5790 }, { "completion_length": 828.75, "epoch": 1.6050443458980044, "grad_norm": 0.0, "kl": 34.99321746826172, "learning_rate": 3.8590264245785955e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5791 }, { "completion_length": 689.0, "epoch": 1.6053215077605323, "grad_norm": 0.0, "kl": 0.2195303738117218, "learning_rate": 3.858658948791581e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5792 }, { "completion_length": 719.5, "epoch": 1.6055986696230597, "grad_norm": 0.39398571848869324, "kl": 10117.16015625, "learning_rate": 3.858291431340146e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5793 }, { "completion_length": 625.25, "epoch": 1.6058758314855877, "grad_norm": 0.0, "kl": 0.1538713127374649, "learning_rate": 3.857923872235565e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5794 }, { "completion_length": 640.75, "epoch": 1.6061529933481153, "grad_norm": 0.6949355006217957, "kl": 13.484936714172363, "learning_rate": 3.857556271489107e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5795 }, { "completion_length": 752.5, "epoch": 1.606430155210643, "grad_norm": 0.0, "kl": 0.20923388004302979, "learning_rate": 3.857188629112045e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5796 }, { "completion_length": 641.0, "epoch": 1.6067073170731707, "grad_norm": 0.5367984175682068, "kl": 3.162219762802124, "learning_rate": 3.856820945115655e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5797 }, { "completion_length": 715.5, "epoch": 1.6069844789356984, "grad_norm": 0.0, "kl": 0.29241806268692017, "learning_rate": 3.856453219511209e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5798 }, { "completion_length": 719.0, "epoch": 1.6072616407982263, "grad_norm": 0.0, "kl": 0.2631036341190338, "learning_rate": 3.8560854523099865e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5799 }, { "completion_length": 743.25, "epoch": 1.6075388026607538, "grad_norm": 0.0, "kl": 0.20818515121936798, "learning_rate": 3.855717643523265e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5800 }, { "completion_length": 664.75, "epoch": 1.6078159645232817, "grad_norm": 0.0, "kl": 0.15460899472236633, "learning_rate": 3.855349793162322e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5801 }, { "completion_length": 604.5, "epoch": 1.6080931263858091, "grad_norm": 0.0, "kl": 1.6582587957382202, "learning_rate": 3.854981901238439e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5802 }, { "completion_length": 827.5, "epoch": 1.608370288248337, "grad_norm": 0.0, "kl": 0.7509946823120117, "learning_rate": 3.854613967762898e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5803 }, { "completion_length": 764.75, "epoch": 1.6086474501108647, "grad_norm": 0.0, "kl": 0.17888964712619781, "learning_rate": 3.854245992746981e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5804 }, { "completion_length": 619.5, "epoch": 1.6089246119733924, "grad_norm": 0.6945925354957581, "kl": 27.4497013092041, "learning_rate": 3.853877976201974e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5805 }, { "completion_length": 647.25, "epoch": 1.6092017738359203, "grad_norm": 0.0, "kl": 0.16163013875484467, "learning_rate": 3.853509918139161e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5806 }, { "completion_length": 673.25, "epoch": 1.6094789356984478, "grad_norm": 0.0, "kl": 0.24636128544807434, "learning_rate": 3.853141818569829e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5807 }, { "completion_length": 697.25, "epoch": 1.6097560975609757, "grad_norm": 0.0, "kl": 0.22994741797447205, "learning_rate": 3.852773677505267e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5808 }, { "completion_length": 710.5, "epoch": 1.6100332594235032, "grad_norm": 0.0, "kl": 0.16001006960868835, "learning_rate": 3.852405494956762e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5809 }, { "completion_length": 680.0, "epoch": 1.610310421286031, "grad_norm": 0.0, "kl": 0.40021297335624695, "learning_rate": 3.852037270935608e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5810 }, { "completion_length": 756.25, "epoch": 1.6105875831485588, "grad_norm": 0.0, "kl": 0.2255827933549881, "learning_rate": 3.851669005453095e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5811 }, { "completion_length": 809.25, "epoch": 1.6108647450110865, "grad_norm": 0.0, "kl": 0.16910217702388763, "learning_rate": 3.851300698520516e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5812 }, { "completion_length": 769.5, "epoch": 1.6111419068736141, "grad_norm": 0.0, "kl": 0.14446014165878296, "learning_rate": 3.850932350149166e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5813 }, { "completion_length": 741.25, "epoch": 1.6114190687361418, "grad_norm": 0.0, "kl": 0.1977369487285614, "learning_rate": 3.850563960350341e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5814 }, { "completion_length": 612.5, "epoch": 1.6116962305986697, "grad_norm": 0.0, "kl": 0.15786992013454437, "learning_rate": 3.850195529135337e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5815 }, { "completion_length": 621.25, "epoch": 1.6119733924611972, "grad_norm": 0.0, "kl": 0.1973728984594345, "learning_rate": 3.8498270565154524e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5816 }, { "completion_length": 638.25, "epoch": 1.612250554323725, "grad_norm": 0.0, "kl": 0.32062071561813354, "learning_rate": 3.849458542501988e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5817 }, { "completion_length": 722.75, "epoch": 1.6125277161862528, "grad_norm": 0.0, "kl": 0.19079715013504028, "learning_rate": 3.849089987106244e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5818 }, { "completion_length": 642.25, "epoch": 1.6128048780487805, "grad_norm": 0.0, "kl": 1.0537831783294678, "learning_rate": 3.848721390339521e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5819 }, { "completion_length": 648.25, "epoch": 1.6130820399113082, "grad_norm": 0.0, "kl": 0.31372547149658203, "learning_rate": 3.8483527522131245e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5820 }, { "completion_length": 668.5, "epoch": 1.6133592017738358, "grad_norm": 0.0, "kl": 0.15412922203540802, "learning_rate": 3.847984072738358e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5821 }, { "completion_length": 640.75, "epoch": 1.6136363636363638, "grad_norm": 0.0, "kl": 0.33520859479904175, "learning_rate": 3.847615351926527e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5822 }, { "completion_length": 602.25, "epoch": 1.6139135254988912, "grad_norm": 0.0, "kl": 0.18444150686264038, "learning_rate": 3.84724658978894e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5823 }, { "completion_length": 628.0, "epoch": 1.6141906873614191, "grad_norm": 0.0, "kl": 0.18929490447044373, "learning_rate": 3.846877786336902e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5824 }, { "completion_length": 705.75, "epoch": 1.6144678492239468, "grad_norm": 0.0, "kl": 0.16975396871566772, "learning_rate": 3.8465089415817275e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5825 }, { "completion_length": 799.25, "epoch": 1.6147450110864745, "grad_norm": 0.0, "kl": 0.16873405873775482, "learning_rate": 3.846140055534725e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5826 }, { "completion_length": 653.75, "epoch": 1.6150221729490022, "grad_norm": 0.0, "kl": 59125.3046875, "learning_rate": 3.845771128207206e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5827 }, { "completion_length": 676.0, "epoch": 1.6152993348115299, "grad_norm": 0.0, "kl": 0.4115675389766693, "learning_rate": 3.845402159610485e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5828 }, { "completion_length": 1019.75, "epoch": 1.6155764966740578, "grad_norm": 0.24338439106941223, "kl": 0.20431867241859436, "learning_rate": 3.8450331497558764e-06, "loss": -0.0, "reward": 1.09375, "reward_std": 1.3125, "rewards/confident_score_func": -0.25, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.59375, "step": 5829 }, { "completion_length": 699.5, "epoch": 1.6158536585365852, "grad_norm": 0.0, "kl": 0.21061193943023682, "learning_rate": 3.844664098654697e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5830 }, { "completion_length": 602.25, "epoch": 1.6161308203991132, "grad_norm": 0.0, "kl": 11.134326934814453, "learning_rate": 3.844295006318263e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5831 }, { "completion_length": 605.0, "epoch": 1.6164079822616408, "grad_norm": 0.0, "kl": 0.2709368169307709, "learning_rate": 3.8439258727578925e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5832 }, { "completion_length": 601.5, "epoch": 1.6166851441241685, "grad_norm": 0.0, "kl": 0.47346624732017517, "learning_rate": 3.843556697984907e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5833 }, { "completion_length": 650.25, "epoch": 1.6169623059866962, "grad_norm": 0.0, "kl": 0.21919852495193481, "learning_rate": 3.843187482010626e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5834 }, { "completion_length": 669.75, "epoch": 1.617239467849224, "grad_norm": 0.0, "kl": 0.18663516640663147, "learning_rate": 3.842818224846373e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5835 }, { "completion_length": 711.75, "epoch": 1.6175166297117518, "grad_norm": 0.0, "kl": 0.2644008994102478, "learning_rate": 3.84244892650347e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5836 }, { "completion_length": 688.5, "epoch": 1.6177937915742793, "grad_norm": 0.0, "kl": 0.20334665477275848, "learning_rate": 3.842079586993244e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5837 }, { "completion_length": 584.0, "epoch": 1.6180709534368072, "grad_norm": 0.0, "kl": 0.2759084403514862, "learning_rate": 3.84171020632702e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5838 }, { "completion_length": 781.25, "epoch": 1.6183481152993349, "grad_norm": 0.0, "kl": 0.16829583048820496, "learning_rate": 3.841340784516125e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5839 }, { "completion_length": 642.5, "epoch": 1.6186252771618626, "grad_norm": 0.0, "kl": 0.18854226171970367, "learning_rate": 3.840971321571888e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5840 }, { "completion_length": 737.75, "epoch": 1.6189024390243902, "grad_norm": 0.0, "kl": 0.16142630577087402, "learning_rate": 3.840601817505638e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5841 }, { "completion_length": 691.0, "epoch": 1.619179600886918, "grad_norm": 0.0, "kl": 0.18501263856887817, "learning_rate": 3.840232272328709e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5842 }, { "completion_length": 568.5, "epoch": 1.6194567627494458, "grad_norm": 0.0, "kl": 0.16722185909748077, "learning_rate": 3.8398626860524295e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5843 }, { "completion_length": 835.25, "epoch": 1.6197339246119733, "grad_norm": 0.3925407826900482, "kl": 0.1688748300075531, "learning_rate": 3.839493058688137e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5844 }, { "completion_length": 671.5, "epoch": 1.6200110864745012, "grad_norm": 0.0, "kl": 0.454990416765213, "learning_rate": 3.839123390247162e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5845 }, { "completion_length": 603.25, "epoch": 1.6202882483370287, "grad_norm": 0.0, "kl": 0.18331624567508698, "learning_rate": 3.838753680740846e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5846 }, { "completion_length": 691.5, "epoch": 1.6205654101995566, "grad_norm": 0.0, "kl": 0.20135873556137085, "learning_rate": 3.838383930180522e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5847 }, { "completion_length": 712.0, "epoch": 1.6208425720620843, "grad_norm": 0.0, "kl": 0.15246926248073578, "learning_rate": 3.8380141385775305e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5848 }, { "completion_length": 737.0, "epoch": 1.621119733924612, "grad_norm": 0.0, "kl": 0.18288037180900574, "learning_rate": 3.837644305943211e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5849 }, { "completion_length": 747.5, "epoch": 1.6213968957871396, "grad_norm": 0.0, "kl": 0.15797822177410126, "learning_rate": 3.837274432288907e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5850 }, { "completion_length": 590.75, "epoch": 1.6216740576496673, "grad_norm": 0.0, "kl": 0.18524257838726044, "learning_rate": 3.836904517625956e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5851 }, { "completion_length": 720.25, "epoch": 1.6219512195121952, "grad_norm": 0.0, "kl": 0.1636980026960373, "learning_rate": 3.836534561965708e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5852 }, { "completion_length": 592.75, "epoch": 1.6222283813747227, "grad_norm": 0.0, "kl": 0.18391327559947968, "learning_rate": 3.836164565319503e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5853 }, { "completion_length": 594.75, "epoch": 1.6225055432372506, "grad_norm": 0.0, "kl": 0.16617636382579803, "learning_rate": 3.8357945276986894e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5854 }, { "completion_length": 703.0, "epoch": 1.6227827050997783, "grad_norm": 0.0, "kl": 0.2588501274585724, "learning_rate": 3.835424449114614e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5855 }, { "completion_length": 643.0, "epoch": 1.623059866962306, "grad_norm": 0.0, "kl": 0.22731034457683563, "learning_rate": 3.835054329578627e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5856 }, { "completion_length": 629.0, "epoch": 1.6233370288248337, "grad_norm": 0.0, "kl": 0.19962668418884277, "learning_rate": 3.834684169102077e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5857 }, { "completion_length": 617.0, "epoch": 1.6236141906873613, "grad_norm": 0.0, "kl": 0.1613444834947586, "learning_rate": 3.8343139676963155e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5858 }, { "completion_length": 675.0, "epoch": 1.6238913525498893, "grad_norm": 0.0, "kl": 0.241343691945076, "learning_rate": 3.833943725372696e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5859 }, { "completion_length": 640.0, "epoch": 1.6241685144124167, "grad_norm": 0.0, "kl": 0.1535363346338272, "learning_rate": 3.833573442142571e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5860 }, { "completion_length": 604.0, "epoch": 1.6244456762749446, "grad_norm": 0.9886249303817749, "kl": 245.82484436035156, "learning_rate": 3.833203118017296e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5861 }, { "completion_length": 664.0, "epoch": 1.6247228381374723, "grad_norm": 0.0, "kl": 0.18112295866012573, "learning_rate": 3.832832753008227e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5862 }, { "completion_length": 622.0, "epoch": 1.625, "grad_norm": 0.0, "kl": 0.2406875640153885, "learning_rate": 3.832462347126722e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5863 }, { "completion_length": 688.0, "epoch": 1.6252771618625277, "grad_norm": 0.0, "kl": 0.17328470945358276, "learning_rate": 3.8320919003841396e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5864 }, { "completion_length": 695.0, "epoch": 1.6255543237250554, "grad_norm": 0.0, "kl": 4345.31201171875, "learning_rate": 3.831721412791841e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5865 }, { "completion_length": 762.75, "epoch": 1.6258314855875833, "grad_norm": 0.0, "kl": 0.224238783121109, "learning_rate": 3.8313508843611854e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5866 }, { "completion_length": 634.25, "epoch": 1.6261086474501107, "grad_norm": 0.0, "kl": 0.20220552384853363, "learning_rate": 3.830980315103536e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5867 }, { "completion_length": 653.5, "epoch": 1.6263858093126387, "grad_norm": 0.0, "kl": 0.17764100432395935, "learning_rate": 3.830609705030258e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5868 }, { "completion_length": 604.25, "epoch": 1.6266629711751663, "grad_norm": 0.0, "kl": 0.1690572202205658, "learning_rate": 3.830239054152716e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5869 }, { "completion_length": 666.25, "epoch": 1.626940133037694, "grad_norm": 0.0, "kl": 0.2139982432126999, "learning_rate": 3.829868362482275e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5870 }, { "completion_length": 609.5, "epoch": 1.6272172949002217, "grad_norm": 0.9482789039611816, "kl": 1.0344436168670654, "learning_rate": 3.829497630030305e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5871 }, { "completion_length": 613.25, "epoch": 1.6274944567627494, "grad_norm": 0.0, "kl": 0.17812590301036835, "learning_rate": 3.829126856808171e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5872 }, { "completion_length": 734.25, "epoch": 1.6277716186252773, "grad_norm": 0.0, "kl": 0.15740866959095, "learning_rate": 3.828756042827246e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5873 }, { "completion_length": 677.0, "epoch": 1.6280487804878048, "grad_norm": 0.0, "kl": 0.17944423854351044, "learning_rate": 3.828385188098902e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5874 }, { "completion_length": 645.0, "epoch": 1.6283259423503327, "grad_norm": 0.0, "kl": 0.1902446746826172, "learning_rate": 3.828014292634508e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5875 }, { "completion_length": 677.5, "epoch": 1.6286031042128604, "grad_norm": 0.0, "kl": 0.2778947651386261, "learning_rate": 3.827643356445442e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5876 }, { "completion_length": 746.0, "epoch": 1.628880266075388, "grad_norm": 0.36676380038261414, "kl": 0.18478401005268097, "learning_rate": 3.8272723795430765e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5877 }, { "completion_length": 650.0, "epoch": 1.6291574279379157, "grad_norm": 0.0, "kl": 0.18412290513515472, "learning_rate": 3.8269013619387885e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5878 }, { "completion_length": 647.25, "epoch": 1.6294345898004434, "grad_norm": 0.45630335807800293, "kl": 0.17687779664993286, "learning_rate": 3.826530303643955e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5879 }, { "completion_length": 620.25, "epoch": 1.6297117516629713, "grad_norm": 0.0, "kl": 8.822480201721191, "learning_rate": 3.826159204669956e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5880 }, { "completion_length": 634.75, "epoch": 1.6299889135254988, "grad_norm": 0.0, "kl": 0.5335948467254639, "learning_rate": 3.825788065028171e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5881 }, { "completion_length": 677.0, "epoch": 1.6302660753880267, "grad_norm": 0.0, "kl": 0.16831432282924652, "learning_rate": 3.82541688472998e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5882 }, { "completion_length": 775.75, "epoch": 1.6305432372505542, "grad_norm": 0.0, "kl": 0.32657697796821594, "learning_rate": 3.825045663786767e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5883 }, { "completion_length": 567.75, "epoch": 1.630820399113082, "grad_norm": 0.0, "kl": 0.19260555505752563, "learning_rate": 3.824674402209917e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5884 }, { "completion_length": 677.5, "epoch": 1.6310975609756098, "grad_norm": 0.0, "kl": 0.16210860013961792, "learning_rate": 3.824303100010812e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5885 }, { "completion_length": 691.25, "epoch": 1.6313747228381374, "grad_norm": 0.0, "kl": 0.1719006896018982, "learning_rate": 3.823931757200841e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5886 }, { "completion_length": 644.75, "epoch": 1.6316518847006651, "grad_norm": 0.0, "kl": 0.258259117603302, "learning_rate": 3.82356037379139e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5887 }, { "completion_length": 679.5, "epoch": 1.6319290465631928, "grad_norm": 0.0, "kl": 0.1794218271970749, "learning_rate": 3.8231889497938475e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5888 }, { "completion_length": 675.25, "epoch": 1.6322062084257207, "grad_norm": 0.0, "kl": 0.17833741009235382, "learning_rate": 3.822817485219605e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5889 }, { "completion_length": 667.5, "epoch": 1.6324833702882482, "grad_norm": 0.5945319533348083, "kl": 2560.87646484375, "learning_rate": 3.822445980080053e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5890 }, { "completion_length": 631.0, "epoch": 1.632760532150776, "grad_norm": 0.4650273621082306, "kl": 65352.12109375, "learning_rate": 3.822074434386584e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5891 }, { "completion_length": 670.0, "epoch": 1.6330376940133038, "grad_norm": 0.0, "kl": 0.16779249906539917, "learning_rate": 3.821702848150591e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5892 }, { "completion_length": 622.5, "epoch": 1.6333148558758315, "grad_norm": 0.0, "kl": 0.2082122564315796, "learning_rate": 3.821331221383471e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5893 }, { "completion_length": 668.25, "epoch": 1.6335920177383592, "grad_norm": 0.0, "kl": 0.2081974744796753, "learning_rate": 3.8209595540966185e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5894 }, { "completion_length": 691.0, "epoch": 1.6338691796008868, "grad_norm": 0.0, "kl": 0.17020614445209503, "learning_rate": 3.8205878463014316e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5895 }, { "completion_length": 747.0, "epoch": 1.6341463414634148, "grad_norm": 1.1890395879745483, "kl": 0.7025078535079956, "learning_rate": 3.820216098009309e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5896 }, { "completion_length": 727.75, "epoch": 1.6344235033259422, "grad_norm": 0.0, "kl": 0.17396923899650574, "learning_rate": 3.819844309231651e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5897 }, { "completion_length": 600.5, "epoch": 1.6347006651884701, "grad_norm": 0.0, "kl": 0.2962343394756317, "learning_rate": 3.819472479979857e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5898 }, { "completion_length": 634.25, "epoch": 1.6349778270509978, "grad_norm": 0.0, "kl": 0.18403054773807526, "learning_rate": 3.819100610265332e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5899 }, { "completion_length": 684.25, "epoch": 1.6352549889135255, "grad_norm": 0.0, "kl": 0.18330281972885132, "learning_rate": 3.818728700099479e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5900 }, { "completion_length": 625.25, "epoch": 1.6355321507760532, "grad_norm": 0.0, "kl": 0.19030870497226715, "learning_rate": 3.818356749493703e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5901 }, { "completion_length": 646.25, "epoch": 1.6358093126385809, "grad_norm": 0.0, "kl": 0.18089956045150757, "learning_rate": 3.8179847584594075e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5902 }, { "completion_length": 617.25, "epoch": 1.6360864745011088, "grad_norm": 0.0, "kl": 0.24097873270511627, "learning_rate": 3.817612727008003e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5903 }, { "completion_length": 691.0, "epoch": 1.6363636363636362, "grad_norm": 1.7034566402435303, "kl": 2.5365653038024902, "learning_rate": 3.817240655150898e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5904 }, { "completion_length": 740.75, "epoch": 1.6366407982261642, "grad_norm": 0.0, "kl": 0.420387327671051, "learning_rate": 3.816868542899502e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5905 }, { "completion_length": 715.0, "epoch": 1.6369179600886918, "grad_norm": 0.0, "kl": 0.18157194554805756, "learning_rate": 3.816496390265224e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5906 }, { "completion_length": 789.25, "epoch": 1.6371951219512195, "grad_norm": 0.9376950860023499, "kl": 1.4784820079803467, "learning_rate": 3.816124197259479e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5907 }, { "completion_length": 691.5, "epoch": 1.6374722838137472, "grad_norm": 0.0, "kl": 0.7027139663696289, "learning_rate": 3.815751963893681e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5908 }, { "completion_length": 592.5, "epoch": 1.637749445676275, "grad_norm": 0.0, "kl": 4.492347717285156, "learning_rate": 3.815379690179244e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5909 }, { "completion_length": 745.5, "epoch": 1.6380266075388028, "grad_norm": 0.0, "kl": 0.1871691793203354, "learning_rate": 3.815007376127582e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5910 }, { "completion_length": 866.25, "epoch": 1.6383037694013303, "grad_norm": 0.0, "kl": 0.1906217485666275, "learning_rate": 3.8146350217501148e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5911 }, { "completion_length": 818.0, "epoch": 1.6385809312638582, "grad_norm": 0.0, "kl": 0.2356404960155487, "learning_rate": 3.8142626270582607e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5912 }, { "completion_length": 765.75, "epoch": 1.6388580931263859, "grad_norm": 0.0, "kl": 0.19100302457809448, "learning_rate": 3.8138901920634373e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5913 }, { "completion_length": 808.0, "epoch": 1.6391352549889135, "grad_norm": 0.0, "kl": 0.16428668797016144, "learning_rate": 3.813517716777069e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5914 }, { "completion_length": 672.25, "epoch": 1.6394124168514412, "grad_norm": 0.0, "kl": 0.17429661750793457, "learning_rate": 3.813145201210576e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5915 }, { "completion_length": 711.25, "epoch": 1.639689578713969, "grad_norm": 0.0, "kl": 0.14197306334972382, "learning_rate": 3.812772645375382e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5916 }, { "completion_length": 570.75, "epoch": 1.6399667405764968, "grad_norm": 0.0, "kl": 0.2328600287437439, "learning_rate": 3.8124000492829126e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5917 }, { "completion_length": 661.0, "epoch": 1.6402439024390243, "grad_norm": 0.0, "kl": 0.16192366182804108, "learning_rate": 3.812027412944592e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5918 }, { "completion_length": 584.75, "epoch": 1.6405210643015522, "grad_norm": 0.0, "kl": 0.17340147495269775, "learning_rate": 3.8116547363718504e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5919 }, { "completion_length": 630.25, "epoch": 1.6407982261640797, "grad_norm": 0.0, "kl": 0.3522666096687317, "learning_rate": 3.8112820195761124e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5920 }, { "completion_length": 792.25, "epoch": 1.6410753880266076, "grad_norm": 0.0, "kl": 0.1875380128622055, "learning_rate": 3.8109092625688103e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5921 }, { "completion_length": 587.5, "epoch": 1.6413525498891353, "grad_norm": 0.0, "kl": 0.16326197981834412, "learning_rate": 3.8105364653613746e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5922 }, { "completion_length": 668.0, "epoch": 1.641629711751663, "grad_norm": 0.0, "kl": 0.24153943359851837, "learning_rate": 3.8101636279652375e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5923 }, { "completion_length": 724.75, "epoch": 1.6419068736141909, "grad_norm": 0.0, "kl": 0.13266077637672424, "learning_rate": 3.809790750391832e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5924 }, { "completion_length": 554.5, "epoch": 1.6421840354767183, "grad_norm": 0.0, "kl": 0.33440205454826355, "learning_rate": 3.809417832652592e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5925 }, { "completion_length": 758.25, "epoch": 1.6424611973392462, "grad_norm": 0.0, "kl": 0.6103653907775879, "learning_rate": 3.809044874758955e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5926 }, { "completion_length": 678.5, "epoch": 1.6427383592017737, "grad_norm": 0.0, "kl": 0.23264697194099426, "learning_rate": 3.808671876722357e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5927 }, { "completion_length": 629.75, "epoch": 1.6430155210643016, "grad_norm": 0.0, "kl": 0.2013588547706604, "learning_rate": 3.808298838554235e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5928 }, { "completion_length": 652.5, "epoch": 1.6432926829268293, "grad_norm": 0.0, "kl": 0.16935022175312042, "learning_rate": 3.8079257602660326e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5929 }, { "completion_length": 659.75, "epoch": 1.643569844789357, "grad_norm": 0.0, "kl": 0.17849503457546234, "learning_rate": 3.807552641869186e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5930 }, { "completion_length": 682.5, "epoch": 1.6438470066518847, "grad_norm": 0.0, "kl": 0.2419130504131317, "learning_rate": 3.8071794833751403e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5931 }, { "completion_length": 742.25, "epoch": 1.6441241685144123, "grad_norm": 0.0, "kl": 0.18754492700099945, "learning_rate": 3.806806284795337e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5932 }, { "completion_length": 795.25, "epoch": 1.6444013303769403, "grad_norm": 0.0, "kl": 0.15011665225028992, "learning_rate": 3.8064330461412215e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5933 }, { "completion_length": 636.5, "epoch": 1.6446784922394677, "grad_norm": 0.4129515588283539, "kl": 6882.423828125, "learning_rate": 3.8060597674242383e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5934 }, { "completion_length": 774.5, "epoch": 1.6449556541019956, "grad_norm": 0.0, "kl": 0.1418670266866684, "learning_rate": 3.8056864486558354e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5935 }, { "completion_length": 629.0, "epoch": 1.6452328159645233, "grad_norm": 0.0, "kl": 0.5767979025840759, "learning_rate": 3.805313089847461e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5936 }, { "completion_length": 786.0, "epoch": 1.645509977827051, "grad_norm": 0.0, "kl": 1305.876220703125, "learning_rate": 3.804939691010564e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5937 }, { "completion_length": 636.5, "epoch": 1.6457871396895787, "grad_norm": 0.0, "kl": 0.23414596915245056, "learning_rate": 3.804566252156595e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5938 }, { "completion_length": 680.25, "epoch": 1.6460643015521064, "grad_norm": 0.0, "kl": 0.21465495228767395, "learning_rate": 3.8041927732970056e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5939 }, { "completion_length": 650.25, "epoch": 1.6463414634146343, "grad_norm": 0.0, "kl": 0.2260921597480774, "learning_rate": 3.80381925444325e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5940 }, { "completion_length": 723.5, "epoch": 1.6466186252771617, "grad_norm": 0.0, "kl": 0.1971454620361328, "learning_rate": 3.803445695606781e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5941 }, { "completion_length": 537.0, "epoch": 1.6468957871396896, "grad_norm": 0.0, "kl": 0.19223302602767944, "learning_rate": 3.8030720967990554e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5942 }, { "completion_length": 689.25, "epoch": 1.6471729490022173, "grad_norm": 0.0, "kl": 0.2030925452709198, "learning_rate": 3.802698458031528e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5943 }, { "completion_length": 627.5, "epoch": 1.647450110864745, "grad_norm": 0.0, "kl": 0.46674421429634094, "learning_rate": 3.802324779315659e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5944 }, { "completion_length": 668.25, "epoch": 1.6477272727272727, "grad_norm": 0.0, "kl": 0.1669442057609558, "learning_rate": 3.8019510606629063e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5945 }, { "completion_length": 627.5, "epoch": 1.6480044345898004, "grad_norm": 0.0, "kl": 0.45450732111930847, "learning_rate": 3.80157730208473e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5946 }, { "completion_length": 596.75, "epoch": 1.6482815964523283, "grad_norm": 0.0, "kl": 0.18923193216323853, "learning_rate": 3.801203503592593e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5947 }, { "completion_length": 745.0, "epoch": 1.6485587583148558, "grad_norm": 0.0, "kl": 22657.0546875, "learning_rate": 3.800829665197957e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5948 }, { "completion_length": 717.5, "epoch": 1.6488359201773837, "grad_norm": 0.4125618040561676, "kl": 0.14338402450084686, "learning_rate": 3.8004557869122873e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5949 }, { "completion_length": 696.5, "epoch": 1.6491130820399114, "grad_norm": 0.0, "kl": 0.13495910167694092, "learning_rate": 3.8000818687470476e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5950 }, { "completion_length": 724.75, "epoch": 1.649390243902439, "grad_norm": 0.0, "kl": 0.15306273102760315, "learning_rate": 3.799707910713705e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5951 }, { "completion_length": 723.25, "epoch": 1.6496674057649667, "grad_norm": 0.0, "kl": 0.34314337372779846, "learning_rate": 3.7993339128237277e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5952 }, { "completion_length": 737.75, "epoch": 1.6499445676274944, "grad_norm": 0.0, "kl": 0.130838081240654, "learning_rate": 3.798959875088584e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5953 }, { "completion_length": 660.75, "epoch": 1.6502217294900223, "grad_norm": 0.4402901232242584, "kl": 6189.7958984375, "learning_rate": 3.798585797519745e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5954 }, { "completion_length": 676.5, "epoch": 1.6504988913525498, "grad_norm": 0.0, "kl": 0.1505325585603714, "learning_rate": 3.7982116801286816e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5955 }, { "completion_length": 668.0, "epoch": 1.6507760532150777, "grad_norm": 0.0, "kl": 0.23227408528327942, "learning_rate": 3.797837522926866e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5956 }, { "completion_length": 693.0, "epoch": 1.6510532150776052, "grad_norm": 0.0, "kl": 0.23264530301094055, "learning_rate": 3.797463325925773e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5957 }, { "completion_length": 703.0, "epoch": 1.651330376940133, "grad_norm": 0.0, "kl": 0.18075606226921082, "learning_rate": 3.797089089136877e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5958 }, { "completion_length": 654.5, "epoch": 1.6516075388026608, "grad_norm": 0.0, "kl": 0.16441909968852997, "learning_rate": 3.796714812571654e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5959 }, { "completion_length": 760.5, "epoch": 1.6518847006651884, "grad_norm": 0.0, "kl": 0.15862341225147247, "learning_rate": 3.7963404962415818e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5960 }, { "completion_length": 686.0, "epoch": 1.6521618625277164, "grad_norm": 0.0, "kl": 0.17896337807178497, "learning_rate": 3.7959661401581404e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5961 }, { "completion_length": 605.5, "epoch": 1.6524390243902438, "grad_norm": 0.0, "kl": 0.20701926946640015, "learning_rate": 3.795591744332807e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5962 }, { "completion_length": 805.0, "epoch": 1.6527161862527717, "grad_norm": 0.0, "kl": 0.19901636242866516, "learning_rate": 3.7952173087770657e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5963 }, { "completion_length": 747.25, "epoch": 1.6529933481152992, "grad_norm": 0.0, "kl": 0.15892939269542694, "learning_rate": 3.7948428335023968e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5964 }, { "completion_length": 669.25, "epoch": 1.653270509977827, "grad_norm": 0.0, "kl": 0.18077874183654785, "learning_rate": 3.794468318520285e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5965 }, { "completion_length": 720.75, "epoch": 1.6535476718403548, "grad_norm": 0.0, "kl": 762.5700073242188, "learning_rate": 3.794093763842214e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5966 }, { "completion_length": 718.0, "epoch": 1.6538248337028825, "grad_norm": 0.0, "kl": 9.261897087097168, "learning_rate": 3.7937191694796703e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5967 }, { "completion_length": 682.25, "epoch": 1.6541019955654102, "grad_norm": 0.0, "kl": 0.20747841894626617, "learning_rate": 3.793344535444142e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5968 }, { "completion_length": 639.5, "epoch": 1.6543791574279378, "grad_norm": 0.0, "kl": 0.19969932734966278, "learning_rate": 3.792969861747117e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5969 }, { "completion_length": 695.5, "epoch": 1.6546563192904657, "grad_norm": 0.0, "kl": 0.17087428271770477, "learning_rate": 3.7925951484000855e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5970 }, { "completion_length": 690.0, "epoch": 1.6549334811529932, "grad_norm": 0.0, "kl": 0.20441946387290955, "learning_rate": 3.792220395414537e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5971 }, { "completion_length": 893.75, "epoch": 1.6552106430155211, "grad_norm": 0.6731313467025757, "kl": 385.9778137207031, "learning_rate": 3.7918456028019656e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5972 }, { "completion_length": 654.25, "epoch": 1.6554878048780488, "grad_norm": 0.0, "kl": 0.16710679233074188, "learning_rate": 3.791470770573862e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5973 }, { "completion_length": 799.25, "epoch": 1.6557649667405765, "grad_norm": 0.0, "kl": 0.22685936093330383, "learning_rate": 3.7910958987417228e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5974 }, { "completion_length": 684.25, "epoch": 1.6560421286031042, "grad_norm": 0.0, "kl": 0.15414448082447052, "learning_rate": 3.7907209873170436e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5975 }, { "completion_length": 663.0, "epoch": 1.6563192904656319, "grad_norm": 0.0, "kl": 0.3047838509082794, "learning_rate": 3.790346036311321e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5976 }, { "completion_length": 633.0, "epoch": 1.6565964523281598, "grad_norm": 0.0, "kl": 0.5238867402076721, "learning_rate": 3.7899710457360526e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5977 }, { "completion_length": 755.5, "epoch": 1.6568736141906872, "grad_norm": 0.0, "kl": 0.16367574036121368, "learning_rate": 3.789596015602739e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5978 }, { "completion_length": 699.0, "epoch": 1.6571507760532151, "grad_norm": 0.0, "kl": 0.20416444540023804, "learning_rate": 3.7892209459228802e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5979 }, { "completion_length": 734.0, "epoch": 1.6574279379157428, "grad_norm": 0.0, "kl": 0.18528582155704498, "learning_rate": 3.788845836707977e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5980 }, { "completion_length": 632.5, "epoch": 1.6577050997782705, "grad_norm": 0.0, "kl": 0.18067681789398193, "learning_rate": 3.788470687969534e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5981 }, { "completion_length": 790.75, "epoch": 1.6579822616407982, "grad_norm": 1.9126248359680176, "kl": 2.0699965953826904, "learning_rate": 3.7880954997190557e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5982 }, { "completion_length": 680.75, "epoch": 1.658259423503326, "grad_norm": 0.0, "kl": 0.1633801907300949, "learning_rate": 3.787720271968046e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5983 }, { "completion_length": 783.5, "epoch": 1.6585365853658538, "grad_norm": 0.0, "kl": 0.16863372921943665, "learning_rate": 3.7873450047280125e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5984 }, { "completion_length": 719.75, "epoch": 1.6588137472283813, "grad_norm": 0.0, "kl": 0.20791693031787872, "learning_rate": 3.7869696980104633e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5985 }, { "completion_length": 696.25, "epoch": 1.6590909090909092, "grad_norm": 0.0, "kl": 0.16852234303951263, "learning_rate": 3.7865943518269064e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5986 }, { "completion_length": 671.5, "epoch": 1.6593680709534369, "grad_norm": 0.35738319158554077, "kl": 0.143351212143898, "learning_rate": 3.786218966188854e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5987 }, { "completion_length": 664.25, "epoch": 1.6596452328159645, "grad_norm": 0.0, "kl": 0.16136877238750458, "learning_rate": 3.7858435411078143e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5988 }, { "completion_length": 603.0, "epoch": 1.6599223946784922, "grad_norm": 0.0, "kl": 0.4678838849067688, "learning_rate": 3.785468076595304e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5989 }, { "completion_length": 706.0, "epoch": 1.66019955654102, "grad_norm": 0.35475999116897583, "kl": 0.1482933610677719, "learning_rate": 3.785092572662834e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5990 }, { "completion_length": 715.25, "epoch": 1.6604767184035478, "grad_norm": 0.0, "kl": 0.15372028946876526, "learning_rate": 3.7847170293219223e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5991 }, { "completion_length": 687.0, "epoch": 1.6607538802660753, "grad_norm": 0.3216225802898407, "kl": 0.17636924982070923, "learning_rate": 3.7843414465840823e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5992 }, { "completion_length": 701.25, "epoch": 1.6610310421286032, "grad_norm": 0.0, "kl": 0.2818928062915802, "learning_rate": 3.7839658244608336e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5993 }, { "completion_length": 673.5, "epoch": 1.6613082039911307, "grad_norm": 0.0, "kl": 0.15584516525268555, "learning_rate": 3.7835901629636933e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5994 }, { "completion_length": 654.25, "epoch": 1.6615853658536586, "grad_norm": 0.0, "kl": 0.18423813581466675, "learning_rate": 3.783214462104183e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5995 }, { "completion_length": 686.75, "epoch": 1.6618625277161863, "grad_norm": 0.0, "kl": 0.17745539546012878, "learning_rate": 3.782838721893823e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5996 }, { "completion_length": 712.75, "epoch": 1.662139689578714, "grad_norm": 0.0, "kl": 0.24273216724395752, "learning_rate": 3.782462942344136e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5997 }, { "completion_length": 878.25, "epoch": 1.6624168514412418, "grad_norm": 0.35444214940071106, "kl": 0.12400400638580322, "learning_rate": 3.782087123466645e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5998 }, { "completion_length": 799.5, "epoch": 1.6626940133037693, "grad_norm": 0.0, "kl": 0.1325085163116455, "learning_rate": 3.781711265272876e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 5999 }, { "completion_length": 713.75, "epoch": 1.6629711751662972, "grad_norm": 0.0, "kl": 0.9310581684112549, "learning_rate": 3.7813353677743535e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6000 }, { "completion_length": 641.0, "epoch": 1.6632483370288247, "grad_norm": 0.0, "kl": 0.1886093020439148, "learning_rate": 3.780959430982606e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6001 }, { "completion_length": 791.0, "epoch": 1.6635254988913526, "grad_norm": 0.4935452342033386, "kl": 823.9701538085938, "learning_rate": 3.7805834549091617e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6002 }, { "completion_length": 757.0, "epoch": 1.6638026607538803, "grad_norm": 0.0, "kl": 0.13668511807918549, "learning_rate": 3.7802074395655497e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6003 }, { "completion_length": 628.25, "epoch": 1.664079822616408, "grad_norm": 0.0, "kl": 21.010286331176758, "learning_rate": 3.7798313849633006e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6004 }, { "completion_length": 828.0, "epoch": 1.6643569844789357, "grad_norm": 0.0, "kl": 0.13900405168533325, "learning_rate": 3.7794552911139472e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6005 }, { "completion_length": 708.25, "epoch": 1.6646341463414633, "grad_norm": 0.0, "kl": 0.16366951167583466, "learning_rate": 3.7790791580290227e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6006 }, { "completion_length": 832.0, "epoch": 1.6649113082039912, "grad_norm": 0.0, "kl": 0.12902328372001648, "learning_rate": 3.7787029857200606e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6007 }, { "completion_length": 715.5, "epoch": 1.6651884700665187, "grad_norm": 0.0, "kl": 0.22321613132953644, "learning_rate": 3.778326774198598e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6008 }, { "completion_length": 625.0, "epoch": 1.6654656319290466, "grad_norm": 0.0, "kl": 0.2196655571460724, "learning_rate": 3.7779505234761704e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6009 }, { "completion_length": 705.0, "epoch": 1.6657427937915743, "grad_norm": 0.0, "kl": 0.20975558459758759, "learning_rate": 3.7775742335643163e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6010 }, { "completion_length": 723.0, "epoch": 1.666019955654102, "grad_norm": 0.0, "kl": 0.15620604157447815, "learning_rate": 3.7771979044745757e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6011 }, { "completion_length": 778.0, "epoch": 1.6662971175166297, "grad_norm": 0.0, "kl": 0.23060455918312073, "learning_rate": 3.776821536218488e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6012 }, { "completion_length": 794.5, "epoch": 1.6665742793791574, "grad_norm": 0.5840645432472229, "kl": 0.34558606147766113, "learning_rate": 3.7764451288075944e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6013 }, { "completion_length": 748.25, "epoch": 1.6668514412416853, "grad_norm": 0.0, "kl": 0.16299423575401306, "learning_rate": 3.7760686822534397e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6014 }, { "completion_length": 602.5, "epoch": 1.6671286031042127, "grad_norm": 0.48191601037979126, "kl": 0.4383774995803833, "learning_rate": 3.7756921965675665e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6015 }, { "completion_length": 715.25, "epoch": 1.6674057649667406, "grad_norm": 0.3930602967739105, "kl": 0.20112942159175873, "learning_rate": 3.77531567176152e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6016 }, { "completion_length": 612.75, "epoch": 1.6676829268292683, "grad_norm": 0.0, "kl": 0.16051940619945526, "learning_rate": 3.7749391078468476e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6017 }, { "completion_length": 829.0, "epoch": 1.667960088691796, "grad_norm": 0.3006644546985626, "kl": 0.20969392359256744, "learning_rate": 3.7745625048350963e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6018 }, { "completion_length": 684.0, "epoch": 1.6682372505543237, "grad_norm": 0.0, "kl": 0.189577117562294, "learning_rate": 3.774185862737815e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6019 }, { "completion_length": 598.75, "epoch": 1.6685144124168514, "grad_norm": 0.0, "kl": 0.16858403384685516, "learning_rate": 3.7738091815665536e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6020 }, { "completion_length": 711.0, "epoch": 1.6687915742793793, "grad_norm": 0.0, "kl": 0.5243553519248962, "learning_rate": 3.7734324613328633e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6021 }, { "completion_length": 616.75, "epoch": 1.6690687361419068, "grad_norm": 0.37013140320777893, "kl": 1761.3775634765625, "learning_rate": 3.7730557020482974e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6022 }, { "completion_length": 677.0, "epoch": 1.6693458980044347, "grad_norm": 0.37668412923812866, "kl": 0.19337807595729828, "learning_rate": 3.772678903724408e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6023 }, { "completion_length": 709.5, "epoch": 1.6696230598669624, "grad_norm": 0.0, "kl": 0.2008742392063141, "learning_rate": 3.7723020663727516e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6024 }, { "completion_length": 779.5, "epoch": 1.66990022172949, "grad_norm": 0.0, "kl": 0.17062094807624817, "learning_rate": 3.771925190004883e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6025 }, { "completion_length": 741.5, "epoch": 1.6701773835920177, "grad_norm": 0.0, "kl": 0.3981176018714905, "learning_rate": 3.77154827463236e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6026 }, { "completion_length": 659.0, "epoch": 1.6704545454545454, "grad_norm": 0.0, "kl": 0.17571279406547546, "learning_rate": 3.7711713202667414e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6027 }, { "completion_length": 743.75, "epoch": 1.6707317073170733, "grad_norm": 0.0, "kl": 0.1727123260498047, "learning_rate": 3.770794326919586e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6028 }, { "completion_length": 627.5, "epoch": 1.6710088691796008, "grad_norm": 0.0, "kl": 0.3464869260787964, "learning_rate": 3.7704172946024553e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6029 }, { "completion_length": 671.25, "epoch": 1.6712860310421287, "grad_norm": 0.0, "kl": 0.17254683375358582, "learning_rate": 3.7700402233269105e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6030 }, { "completion_length": 649.75, "epoch": 1.6715631929046562, "grad_norm": 0.0, "kl": 0.15816858410835266, "learning_rate": 3.769663113104516e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6031 }, { "completion_length": 747.75, "epoch": 1.671840354767184, "grad_norm": 0.0, "kl": 0.8060569167137146, "learning_rate": 3.7692859639468347e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6032 }, { "completion_length": 767.0, "epoch": 1.6721175166297118, "grad_norm": 0.0, "kl": 0.27777335047721863, "learning_rate": 3.768908775865433e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6033 }, { "completion_length": 735.25, "epoch": 1.6723946784922394, "grad_norm": 0.0, "kl": 0.18619367480278015, "learning_rate": 3.7685315488718778e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6034 }, { "completion_length": 645.75, "epoch": 1.6726718403547673, "grad_norm": 0.0, "kl": 0.2035406231880188, "learning_rate": 3.7681542829777366e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6035 }, { "completion_length": 617.0, "epoch": 1.6729490022172948, "grad_norm": 0.0, "kl": 0.4310673177242279, "learning_rate": 3.7677769781945794e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6036 }, { "completion_length": 705.0, "epoch": 1.6732261640798227, "grad_norm": 0.0, "kl": 0.19505570828914642, "learning_rate": 3.767399634533976e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6037 }, { "completion_length": 681.5, "epoch": 1.6735033259423502, "grad_norm": 0.0, "kl": 0.1868034154176712, "learning_rate": 3.7670222520074983e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6038 }, { "completion_length": 718.25, "epoch": 1.673780487804878, "grad_norm": 0.0, "kl": 0.4623681306838989, "learning_rate": 3.766644830626719e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6039 }, { "completion_length": 616.0, "epoch": 1.6740576496674058, "grad_norm": 0.0, "kl": 0.5362570881843567, "learning_rate": 3.7662673704032117e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6040 }, { "completion_length": 584.0, "epoch": 1.6743348115299335, "grad_norm": 0.0, "kl": 0.18829737603664398, "learning_rate": 3.7658898713485507e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6041 }, { "completion_length": 722.5, "epoch": 1.6746119733924612, "grad_norm": 0.0, "kl": 0.3868751525878906, "learning_rate": 3.7655123334743147e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6042 }, { "completion_length": 703.75, "epoch": 1.6748891352549888, "grad_norm": 0.0, "kl": 0.22747087478637695, "learning_rate": 3.765134756792079e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6043 }, { "completion_length": 747.0, "epoch": 1.6751662971175167, "grad_norm": 0.0, "kl": 0.1568242758512497, "learning_rate": 3.7647571413134236e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6044 }, { "completion_length": 684.25, "epoch": 1.6754434589800442, "grad_norm": 0.0, "kl": 0.14182372391223907, "learning_rate": 3.7643794870499277e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6045 }, { "completion_length": 590.75, "epoch": 1.6757206208425721, "grad_norm": 0.0, "kl": 0.2063199281692505, "learning_rate": 3.764001794013173e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6046 }, { "completion_length": 591.25, "epoch": 1.6759977827050998, "grad_norm": 0.0, "kl": 0.18161699175834656, "learning_rate": 3.7636240622147403e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6047 }, { "completion_length": 620.75, "epoch": 1.6762749445676275, "grad_norm": 0.0, "kl": 0.19996479153633118, "learning_rate": 3.763246291666215e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6048 }, { "completion_length": 615.0, "epoch": 1.6765521064301552, "grad_norm": 0.0, "kl": 0.1799633800983429, "learning_rate": 3.7628684823791805e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6049 }, { "completion_length": 872.5, "epoch": 1.6768292682926829, "grad_norm": 0.0, "kl": 0.16480900347232819, "learning_rate": 3.762490634365224e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6050 }, { "completion_length": 739.0, "epoch": 1.6771064301552108, "grad_norm": 0.0, "kl": 0.5628191232681274, "learning_rate": 3.762112747635931e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6051 }, { "completion_length": 782.25, "epoch": 1.6773835920177382, "grad_norm": 0.0, "kl": 0.23151998221874237, "learning_rate": 3.76173482220289e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6052 }, { "completion_length": 757.75, "epoch": 1.6776607538802661, "grad_norm": 0.0, "kl": 0.4866238236427307, "learning_rate": 3.7613568580776912e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6053 }, { "completion_length": 731.0, "epoch": 1.6779379157427938, "grad_norm": 0.0, "kl": 0.20414288341999054, "learning_rate": 3.760978855271924e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6054 }, { "completion_length": 735.5, "epoch": 1.6782150776053215, "grad_norm": 0.0, "kl": 0.8265487551689148, "learning_rate": 3.760600813797181e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6055 }, { "completion_length": 675.5, "epoch": 1.6784922394678492, "grad_norm": 0.42468777298927307, "kl": 108472.71875, "learning_rate": 3.7602227336650553e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6056 }, { "completion_length": 641.0, "epoch": 1.6787694013303769, "grad_norm": 0.0, "kl": 0.18962255120277405, "learning_rate": 3.759844614887141e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6057 }, { "completion_length": 651.0, "epoch": 1.6790465631929048, "grad_norm": 0.0, "kl": 0.21077410876750946, "learning_rate": 3.759466457475033e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6058 }, { "completion_length": 918.25, "epoch": 1.6793237250554323, "grad_norm": 0.0, "kl": 0.21749010682106018, "learning_rate": 3.7590882614403278e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6059 }, { "completion_length": 660.0, "epoch": 1.6796008869179602, "grad_norm": 0.0, "kl": 0.18489034473896027, "learning_rate": 3.758710026794623e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6060 }, { "completion_length": 672.75, "epoch": 1.6798780487804879, "grad_norm": 0.0, "kl": 0.1861332207918167, "learning_rate": 3.758331753549519e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6061 }, { "completion_length": 875.0, "epoch": 1.6801552106430155, "grad_norm": 0.0, "kl": 0.1409870982170105, "learning_rate": 3.757953441716613e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6062 }, { "completion_length": 618.0, "epoch": 1.6804323725055432, "grad_norm": 0.0, "kl": 814.4603271484375, "learning_rate": 3.757575091307509e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6063 }, { "completion_length": 634.25, "epoch": 1.680709534368071, "grad_norm": 0.0, "kl": 0.41812482476234436, "learning_rate": 3.757196702333808e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6064 }, { "completion_length": 674.75, "epoch": 1.6809866962305988, "grad_norm": 0.0, "kl": 0.26670241355895996, "learning_rate": 3.756818274807114e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6065 }, { "completion_length": 653.25, "epoch": 1.6812638580931263, "grad_norm": 0.0, "kl": 0.17101237177848816, "learning_rate": 3.7564398087390317e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6066 }, { "completion_length": 731.25, "epoch": 1.6815410199556542, "grad_norm": 0.0, "kl": 0.1909511685371399, "learning_rate": 3.756061304141167e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6067 }, { "completion_length": 755.5, "epoch": 1.6818181818181817, "grad_norm": 0.0, "kl": 0.1533581167459488, "learning_rate": 3.755682761025128e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6068 }, { "completion_length": 709.5, "epoch": 1.6820953436807096, "grad_norm": 0.0, "kl": 0.1671089380979538, "learning_rate": 3.7553041794025214e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6069 }, { "completion_length": 752.25, "epoch": 1.6823725055432373, "grad_norm": 0.0, "kl": 0.15694229304790497, "learning_rate": 3.7549255592849575e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6070 }, { "completion_length": 733.0, "epoch": 1.682649667405765, "grad_norm": 0.0, "kl": 1.1704286336898804, "learning_rate": 3.754546900684047e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6071 }, { "completion_length": 730.25, "epoch": 1.6829268292682928, "grad_norm": 0.0, "kl": 941.9754028320312, "learning_rate": 3.7541682036114023e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6072 }, { "completion_length": 865.25, "epoch": 1.6832039911308203, "grad_norm": 0.0, "kl": 0.17123118042945862, "learning_rate": 3.753789468078636e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6073 }, { "completion_length": 661.0, "epoch": 1.6834811529933482, "grad_norm": 0.8533766865730286, "kl": 68.6623306274414, "learning_rate": 3.7534106940973626e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6074 }, { "completion_length": 736.0, "epoch": 1.6837583148558757, "grad_norm": 0.0, "kl": 0.15776145458221436, "learning_rate": 3.7530318816791966e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6075 }, { "completion_length": 672.5, "epoch": 1.6840354767184036, "grad_norm": 0.0, "kl": 0.43421873450279236, "learning_rate": 3.7526530308357557e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6076 }, { "completion_length": 644.5, "epoch": 1.6843126385809313, "grad_norm": 0.0, "kl": 0.18860167264938354, "learning_rate": 3.7522741415786562e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6077 }, { "completion_length": 613.75, "epoch": 1.684589800443459, "grad_norm": 0.0, "kl": 0.18744492530822754, "learning_rate": 3.75189521391952e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6078 }, { "completion_length": 726.5, "epoch": 1.6848669623059866, "grad_norm": 0.0, "kl": 0.17820270359516144, "learning_rate": 3.7515162478699636e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6079 }, { "completion_length": 719.25, "epoch": 1.6851441241685143, "grad_norm": 0.0, "kl": 0.18748365342617035, "learning_rate": 3.751137243441611e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6080 }, { "completion_length": 757.25, "epoch": 1.6854212860310422, "grad_norm": 0.0, "kl": 0.2056816816329956, "learning_rate": 3.7507582006460834e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6081 }, { "completion_length": 650.5, "epoch": 1.6856984478935697, "grad_norm": 0.0, "kl": 0.45761626958847046, "learning_rate": 3.7503791194950047e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6082 }, { "completion_length": 618.25, "epoch": 1.6859756097560976, "grad_norm": 0.0, "kl": 0.26557019352912903, "learning_rate": 3.7500000000000005e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6083 }, { "completion_length": 705.25, "epoch": 1.6862527716186253, "grad_norm": 0.0, "kl": 0.20918835699558258, "learning_rate": 3.7496208421726953e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6084 }, { "completion_length": 560.25, "epoch": 1.686529933481153, "grad_norm": 0.0, "kl": 0.2573930025100708, "learning_rate": 3.749241646024718e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6085 }, { "completion_length": 809.5, "epoch": 1.6868070953436807, "grad_norm": 0.0, "kl": 0.18989312648773193, "learning_rate": 3.7488624115676954e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6086 }, { "completion_length": 632.25, "epoch": 1.6870842572062084, "grad_norm": 0.0, "kl": 0.15465998649597168, "learning_rate": 3.7484831388132586e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6087 }, { "completion_length": 672.0, "epoch": 1.6873614190687363, "grad_norm": 0.0, "kl": 0.36596813797950745, "learning_rate": 3.7481038277730367e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6088 }, { "completion_length": 733.0, "epoch": 1.6876385809312637, "grad_norm": 0.0, "kl": 0.20809964835643768, "learning_rate": 3.747724478458663e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6089 }, { "completion_length": 647.75, "epoch": 1.6879157427937916, "grad_norm": 0.0, "kl": 0.20997731387615204, "learning_rate": 3.7473450908817697e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6090 }, { "completion_length": 715.0, "epoch": 1.6881929046563193, "grad_norm": 0.0, "kl": 0.20563140511512756, "learning_rate": 3.7469656650539913e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6091 }, { "completion_length": 792.5, "epoch": 1.688470066518847, "grad_norm": 0.0, "kl": 0.18530943989753723, "learning_rate": 3.7465862009869625e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6092 }, { "completion_length": 664.75, "epoch": 1.6887472283813747, "grad_norm": 0.0, "kl": 0.1679919809103012, "learning_rate": 3.7462066986923217e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6093 }, { "completion_length": 657.25, "epoch": 1.6890243902439024, "grad_norm": 0.0, "kl": 0.19909459352493286, "learning_rate": 3.7458271581817053e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6094 }, { "completion_length": 595.5, "epoch": 1.6893015521064303, "grad_norm": 0.0, "kl": 0.201203852891922, "learning_rate": 3.745447579466752e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6095 }, { "completion_length": 682.0, "epoch": 1.6895787139689578, "grad_norm": 0.0, "kl": 0.17921432852745056, "learning_rate": 3.7450679625591023e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6096 }, { "completion_length": 691.25, "epoch": 1.6898558758314857, "grad_norm": 0.0, "kl": 0.6021597981452942, "learning_rate": 3.744688307470399e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6097 }, { "completion_length": 662.0, "epoch": 1.6901330376940134, "grad_norm": 0.4674912691116333, "kl": 177162.109375, "learning_rate": 3.7443086142122813e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6098 }, { "completion_length": 752.5, "epoch": 1.690410199556541, "grad_norm": 0.0, "kl": 0.16613613069057465, "learning_rate": 3.743928882796395e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6099 }, { "completion_length": 753.25, "epoch": 1.6906873614190687, "grad_norm": 0.0, "kl": 0.15225321054458618, "learning_rate": 3.743549113234385e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6100 }, { "completion_length": 697.5, "epoch": 1.6909645232815964, "grad_norm": 0.0, "kl": 10513.306640625, "learning_rate": 3.7431693055378964e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6101 }, { "completion_length": 632.25, "epoch": 1.6912416851441243, "grad_norm": 0.0, "kl": 0.1738973706960678, "learning_rate": 3.7427894597185766e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6102 }, { "completion_length": 718.25, "epoch": 1.6915188470066518, "grad_norm": 0.0, "kl": 0.19548924267292023, "learning_rate": 3.742409575788074e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6103 }, { "completion_length": 636.75, "epoch": 1.6917960088691797, "grad_norm": 0.0, "kl": 9.59924030303955, "learning_rate": 3.742029653758039e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6104 }, { "completion_length": 691.75, "epoch": 1.6920731707317072, "grad_norm": 0.0, "kl": 0.25916439294815063, "learning_rate": 3.7416496936401198e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6105 }, { "completion_length": 694.25, "epoch": 1.692350332594235, "grad_norm": 0.0, "kl": 0.5198821425437927, "learning_rate": 3.7412696954459704e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6106 }, { "completion_length": 635.5, "epoch": 1.6926274944567627, "grad_norm": 0.0, "kl": 0.1896592676639557, "learning_rate": 3.7408896591872416e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6107 }, { "completion_length": 781.25, "epoch": 1.6929046563192904, "grad_norm": 0.0, "kl": 0.16996559500694275, "learning_rate": 3.7405095848755907e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6108 }, { "completion_length": 605.0, "epoch": 1.6931818181818183, "grad_norm": 0.0, "kl": 0.390671044588089, "learning_rate": 3.7401294725226707e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6109 }, { "completion_length": 720.5, "epoch": 1.6934589800443458, "grad_norm": 0.0, "kl": 0.2458242028951645, "learning_rate": 3.739749322140138e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6110 }, { "completion_length": 740.75, "epoch": 1.6937361419068737, "grad_norm": 0.0, "kl": 0.215727299451828, "learning_rate": 3.7393691337396515e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6111 }, { "completion_length": 805.0, "epoch": 1.6940133037694012, "grad_norm": 0.0, "kl": 152.60240173339844, "learning_rate": 3.7389889073328693e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6112 }, { "completion_length": 649.75, "epoch": 1.694290465631929, "grad_norm": 1.196484923362732, "kl": 2.8553502559661865, "learning_rate": 3.738608642931451e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6113 }, { "completion_length": 590.25, "epoch": 1.6945676274944568, "grad_norm": 0.0, "kl": 0.45048224925994873, "learning_rate": 3.7382283405470587e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6114 }, { "completion_length": 647.0, "epoch": 1.6948447893569845, "grad_norm": 0.0, "kl": 0.1801983118057251, "learning_rate": 3.737848000191353e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6115 }, { "completion_length": 742.75, "epoch": 1.6951219512195121, "grad_norm": 0.41278740763664246, "kl": 0.20086967945098877, "learning_rate": 3.7374676218759997e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6116 }, { "completion_length": 612.25, "epoch": 1.6953991130820398, "grad_norm": 0.0, "kl": 0.31373798847198486, "learning_rate": 3.7370872056126617e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6117 }, { "completion_length": 701.25, "epoch": 1.6956762749445677, "grad_norm": 0.3935609459877014, "kl": 0.16074392199516296, "learning_rate": 3.736706751413005e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6118 }, { "completion_length": 719.0, "epoch": 1.6959534368070952, "grad_norm": 0.0, "kl": 0.26332876086235046, "learning_rate": 3.7363262592886974e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6119 }, { "completion_length": 782.75, "epoch": 1.6962305986696231, "grad_norm": 0.6300601959228516, "kl": 4.1495161056518555, "learning_rate": 3.7359457292514056e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6120 }, { "completion_length": 785.0, "epoch": 1.6965077605321508, "grad_norm": 0.0, "kl": 0.15324215590953827, "learning_rate": 3.7355651613128004e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6121 }, { "completion_length": 631.0, "epoch": 1.6967849223946785, "grad_norm": 0.0, "kl": 0.1910133808851242, "learning_rate": 3.7351845554845505e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6122 }, { "completion_length": 687.75, "epoch": 1.6970620842572062, "grad_norm": 0.0, "kl": 0.19158579409122467, "learning_rate": 3.73480391177833e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6123 }, { "completion_length": 685.25, "epoch": 1.6973392461197339, "grad_norm": 0.0, "kl": 0.23198473453521729, "learning_rate": 3.7344232302058086e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6124 }, { "completion_length": 584.75, "epoch": 1.6976164079822618, "grad_norm": 0.0, "kl": 0.2929691970348358, "learning_rate": 3.7340425107786634e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6125 }, { "completion_length": 726.0, "epoch": 1.6978935698447892, "grad_norm": 0.0, "kl": 0.15629716217517853, "learning_rate": 3.7336617535085666e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6126 }, { "completion_length": 723.75, "epoch": 1.6981707317073171, "grad_norm": 0.0, "kl": 1.103855848312378, "learning_rate": 3.7332809584071968e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6127 }, { "completion_length": 626.5, "epoch": 1.6984478935698448, "grad_norm": 0.8592762351036072, "kl": 4454.0693359375, "learning_rate": 3.7329001254862297e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6128 }, { "completion_length": 680.5, "epoch": 1.6987250554323725, "grad_norm": 2.0429413318634033, "kl": 8.782163619995117, "learning_rate": 3.732519254757344e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6129 }, { "completion_length": 661.0, "epoch": 1.6990022172949002, "grad_norm": 0.0, "kl": 0.18685366213321686, "learning_rate": 3.7321383462322202e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6130 }, { "completion_length": 721.75, "epoch": 1.6992793791574279, "grad_norm": 0.0, "kl": 0.1755474954843521, "learning_rate": 3.7317573999225397e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6131 }, { "completion_length": 815.0, "epoch": 1.6995565410199558, "grad_norm": 0.0, "kl": 0.18468132615089417, "learning_rate": 3.7313764158399827e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6132 }, { "completion_length": 671.0, "epoch": 1.6998337028824833, "grad_norm": 0.0, "kl": 0.1751091480255127, "learning_rate": 3.730995393996234e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6133 }, { "completion_length": 741.0, "epoch": 1.7001108647450112, "grad_norm": 0.0, "kl": 0.9059382081031799, "learning_rate": 3.7306143344029776e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6134 }, { "completion_length": 610.25, "epoch": 1.7003880266075388, "grad_norm": 0.4896615743637085, "kl": 0.1832028478384018, "learning_rate": 3.7302332370718988e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6135 }, { "completion_length": 683.25, "epoch": 1.7006651884700665, "grad_norm": 0.0, "kl": 0.16931326687335968, "learning_rate": 3.729852102014684e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6136 }, { "completion_length": 788.25, "epoch": 1.7009423503325942, "grad_norm": 0.0, "kl": 0.16441501677036285, "learning_rate": 3.7294709292430205e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6137 }, { "completion_length": 667.75, "epoch": 1.701219512195122, "grad_norm": 0.0, "kl": 0.1970599740743637, "learning_rate": 3.729089718768599e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6138 }, { "completion_length": 714.0, "epoch": 1.7014966740576498, "grad_norm": 0.3871411383152008, "kl": 3614.6005859375, "learning_rate": 3.7287084706031086e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6139 }, { "completion_length": 604.25, "epoch": 1.7017738359201773, "grad_norm": 0.0, "kl": 0.184734508395195, "learning_rate": 3.7283271847582405e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6140 }, { "completion_length": 594.75, "epoch": 1.7020509977827052, "grad_norm": 0.0, "kl": 0.17223288118839264, "learning_rate": 3.7279458612456876e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6141 }, { "completion_length": 661.25, "epoch": 1.7023281596452327, "grad_norm": 0.0, "kl": 0.15935930609703064, "learning_rate": 3.727564500077143e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6142 }, { "completion_length": 751.5, "epoch": 1.7026053215077606, "grad_norm": 0.0, "kl": 0.13640518486499786, "learning_rate": 3.7271831012643023e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6143 }, { "completion_length": 690.0, "epoch": 1.7028824833702882, "grad_norm": 0.0, "kl": 0.45013976097106934, "learning_rate": 3.72680166481886e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6144 }, { "completion_length": 691.0, "epoch": 1.703159645232816, "grad_norm": 0.0, "kl": 0.6638867855072021, "learning_rate": 3.7264201907525144e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6145 }, { "completion_length": 766.0, "epoch": 1.7034368070953438, "grad_norm": 0.0, "kl": 0.3033340275287628, "learning_rate": 3.7260386790769633e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6146 }, { "completion_length": 952.5, "epoch": 1.7037139689578713, "grad_norm": 0.0, "kl": 0.13238567113876343, "learning_rate": 3.7256571298039063e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6147 }, { "completion_length": 656.75, "epoch": 1.7039911308203992, "grad_norm": 0.0, "kl": 0.20494653284549713, "learning_rate": 3.7252755429450437e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6148 }, { "completion_length": 731.5, "epoch": 1.7042682926829267, "grad_norm": 0.0, "kl": 0.20418700575828552, "learning_rate": 3.724893918512077e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6149 }, { "completion_length": 763.25, "epoch": 1.7045454545454546, "grad_norm": 0.3317970037460327, "kl": 0.16660630702972412, "learning_rate": 3.7245122565167092e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6150 }, { "completion_length": 555.0, "epoch": 1.7048226164079823, "grad_norm": 0.0, "kl": 0.16945880651474, "learning_rate": 3.724130556970645e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6151 }, { "completion_length": 726.75, "epoch": 1.70509977827051, "grad_norm": 0.0, "kl": 0.1552693396806717, "learning_rate": 3.7237488198855875e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6152 }, { "completion_length": 794.0, "epoch": 1.7053769401330376, "grad_norm": 0.0, "kl": 0.15199972689151764, "learning_rate": 3.7233670452732456e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6153 }, { "completion_length": 610.0, "epoch": 1.7056541019955653, "grad_norm": 0.44965678453445435, "kl": 26021.796875, "learning_rate": 3.722985233145325e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6154 }, { "completion_length": 701.75, "epoch": 1.7059312638580932, "grad_norm": 0.0, "kl": 0.5182908773422241, "learning_rate": 3.722603383513535e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6155 }, { "completion_length": 664.0, "epoch": 1.7062084257206207, "grad_norm": 0.0, "kl": 0.22231309115886688, "learning_rate": 3.7222214963895854e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6156 }, { "completion_length": 710.75, "epoch": 1.7064855875831486, "grad_norm": 0.0, "kl": 0.1707703024148941, "learning_rate": 3.721839571785186e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6157 }, { "completion_length": 875.0, "epoch": 1.7067627494456763, "grad_norm": 0.5040264129638672, "kl": 0.17918510735034943, "learning_rate": 3.7214576097120503e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6158 }, { "completion_length": 766.5, "epoch": 1.707039911308204, "grad_norm": 0.0, "kl": 0.1979292631149292, "learning_rate": 3.7210756101818907e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6159 }, { "completion_length": 648.5, "epoch": 1.7073170731707317, "grad_norm": 0.0, "kl": 0.28289794921875, "learning_rate": 3.7206935732064216e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6160 }, { "completion_length": 645.25, "epoch": 1.7075942350332594, "grad_norm": 0.0, "kl": 0.17548981308937073, "learning_rate": 3.720311498797359e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6161 }, { "completion_length": 664.75, "epoch": 1.7078713968957873, "grad_norm": 0.0, "kl": 0.21988020837306976, "learning_rate": 3.7199293869664183e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6162 }, { "completion_length": 753.5, "epoch": 1.7081485587583147, "grad_norm": 0.0, "kl": 0.24288691580295563, "learning_rate": 3.719547237725319e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6163 }, { "completion_length": 644.5, "epoch": 1.7084257206208426, "grad_norm": 0.0, "kl": 0.2540733516216278, "learning_rate": 3.719165051085778e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6164 }, { "completion_length": 613.75, "epoch": 1.7087028824833703, "grad_norm": 0.0, "kl": 0.16566787660121918, "learning_rate": 3.718782827059517e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6165 }, { "completion_length": 661.0, "epoch": 1.708980044345898, "grad_norm": 0.0, "kl": 217.25563049316406, "learning_rate": 3.7184005656582573e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6166 }, { "completion_length": 663.25, "epoch": 1.7092572062084257, "grad_norm": 0.0, "kl": 0.17872612178325653, "learning_rate": 3.7180182668937203e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6167 }, { "completion_length": 779.0, "epoch": 1.7095343680709534, "grad_norm": 0.0, "kl": 0.4060419201850891, "learning_rate": 3.7176359307776295e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6168 }, { "completion_length": 855.5, "epoch": 1.7098115299334813, "grad_norm": 0.4027949571609497, "kl": 0.14442147314548492, "learning_rate": 3.71725355732171e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6169 }, { "completion_length": 764.5, "epoch": 1.7100886917960088, "grad_norm": 0.0, "kl": 0.15360119938850403, "learning_rate": 3.716871146537688e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6170 }, { "completion_length": 707.25, "epoch": 1.7103658536585367, "grad_norm": 0.5506470799446106, "kl": 2810.83154296875, "learning_rate": 3.7164886984372894e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6171 }, { "completion_length": 710.75, "epoch": 1.7106430155210643, "grad_norm": 0.0, "kl": 0.1602822244167328, "learning_rate": 3.7161062130322434e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6172 }, { "completion_length": 755.75, "epoch": 1.710920177383592, "grad_norm": 0.0, "kl": 0.2131505161523819, "learning_rate": 3.7157236903342786e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6173 }, { "completion_length": 713.5, "epoch": 1.7111973392461197, "grad_norm": 0.0, "kl": 0.2113078087568283, "learning_rate": 3.7153411303551255e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6174 }, { "completion_length": 849.25, "epoch": 1.7114745011086474, "grad_norm": 0.0, "kl": 0.1828165203332901, "learning_rate": 3.714958533106515e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6175 }, { "completion_length": 805.0, "epoch": 1.7117516629711753, "grad_norm": 0.0, "kl": 0.24708007276058197, "learning_rate": 3.7145758986001817e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6176 }, { "completion_length": 811.25, "epoch": 1.7120288248337028, "grad_norm": 0.0, "kl": 0.1886271983385086, "learning_rate": 3.714193226847857e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6177 }, { "completion_length": 883.75, "epoch": 1.7123059866962307, "grad_norm": 0.34072718024253845, "kl": 0.15588684380054474, "learning_rate": 3.7138105178612778e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6178 }, { "completion_length": 774.5, "epoch": 1.7125831485587582, "grad_norm": 0.0, "kl": 0.3676105737686157, "learning_rate": 3.713427771652179e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6179 }, { "completion_length": 666.25, "epoch": 1.712860310421286, "grad_norm": 0.0, "kl": 0.42604562640190125, "learning_rate": 3.713044988232298e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6180 }, { "completion_length": 668.75, "epoch": 1.7131374722838137, "grad_norm": 0.0, "kl": 0.14881183207035065, "learning_rate": 3.7126621676133735e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6181 }, { "completion_length": 606.25, "epoch": 1.7134146341463414, "grad_norm": 0.6310132145881653, "kl": 166.5265655517578, "learning_rate": 3.7122793098071443e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6182 }, { "completion_length": 788.0, "epoch": 1.7136917960088693, "grad_norm": 0.0, "kl": 0.18628041446208954, "learning_rate": 3.711896414825352e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6183 }, { "completion_length": 658.5, "epoch": 1.7139689578713968, "grad_norm": 0.0, "kl": 0.22527244687080383, "learning_rate": 3.7115134826797384e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6184 }, { "completion_length": 715.5, "epoch": 1.7142461197339247, "grad_norm": 0.0, "kl": 0.19423170387744904, "learning_rate": 3.7111305133820457e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6185 }, { "completion_length": 687.25, "epoch": 1.7145232815964522, "grad_norm": 0.0, "kl": 0.20371240377426147, "learning_rate": 3.710747506944018e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6186 }, { "completion_length": 742.75, "epoch": 1.71480044345898, "grad_norm": 0.0, "kl": 0.14975938200950623, "learning_rate": 3.7103644633774015e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6187 }, { "completion_length": 678.75, "epoch": 1.7150776053215078, "grad_norm": 0.0, "kl": 0.15696364641189575, "learning_rate": 3.709981382693942e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6188 }, { "completion_length": 696.75, "epoch": 1.7153547671840355, "grad_norm": 0.0, "kl": 0.22691719233989716, "learning_rate": 3.7095982649053864e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6189 }, { "completion_length": 694.25, "epoch": 1.7156319290465631, "grad_norm": 0.0, "kl": 0.5733414888381958, "learning_rate": 3.7092151100234835e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6190 }, { "completion_length": 704.75, "epoch": 1.7159090909090908, "grad_norm": 0.0, "kl": 0.21942421793937683, "learning_rate": 3.7088319180599842e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6191 }, { "completion_length": 777.5, "epoch": 1.7161862527716187, "grad_norm": 0.0, "kl": 0.1902778297662735, "learning_rate": 3.708448689026638e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6192 }, { "completion_length": 776.0, "epoch": 1.7164634146341462, "grad_norm": 0.0, "kl": 0.16962264478206635, "learning_rate": 3.708065422935198e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6193 }, { "completion_length": 720.25, "epoch": 1.716740576496674, "grad_norm": 0.0, "kl": 54.085411071777344, "learning_rate": 3.707682119797417e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6194 }, { "completion_length": 697.25, "epoch": 1.7170177383592018, "grad_norm": 0.0, "kl": 0.251009076833725, "learning_rate": 3.7072987796250493e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6195 }, { "completion_length": 670.5, "epoch": 1.7172949002217295, "grad_norm": 0.46182042360305786, "kl": 72034.359375, "learning_rate": 3.706915402429849e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6196 }, { "completion_length": 837.0, "epoch": 1.7175720620842572, "grad_norm": 0.32822439074516296, "kl": 0.1263355314731598, "learning_rate": 3.706531988223575e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6197 }, { "completion_length": 778.0, "epoch": 1.7178492239467849, "grad_norm": 0.0, "kl": 0.18244101107120514, "learning_rate": 3.706148537017984e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6198 }, { "completion_length": 696.25, "epoch": 1.7181263858093128, "grad_norm": 0.0, "kl": 0.36275139451026917, "learning_rate": 3.7057650488248343e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6199 }, { "completion_length": 814.0, "epoch": 1.7184035476718402, "grad_norm": 0.0, "kl": 0.17297258973121643, "learning_rate": 3.7053815236558865e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6200 }, { "completion_length": 756.0, "epoch": 1.7186807095343681, "grad_norm": 1.2307006120681763, "kl": 0.23996321856975555, "learning_rate": 3.704997961522902e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6201 }, { "completion_length": 1248.75, "epoch": 1.7189578713968958, "grad_norm": 0.0, "kl": 0.8549677729606628, "learning_rate": 3.7046143624376427e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6202 }, { "completion_length": 653.25, "epoch": 1.7192350332594235, "grad_norm": 0.0, "kl": 0.19222033023834229, "learning_rate": 3.7042307264118714e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6203 }, { "completion_length": 732.25, "epoch": 1.7195121951219512, "grad_norm": 0.0, "kl": 0.1877927929162979, "learning_rate": 3.703847053457354e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6204 }, { "completion_length": 686.25, "epoch": 1.7197893569844789, "grad_norm": 0.0, "kl": 0.22038552165031433, "learning_rate": 3.703463343585855e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6205 }, { "completion_length": 605.25, "epoch": 1.7200665188470068, "grad_norm": 0.0, "kl": 0.16129383444786072, "learning_rate": 3.7030795968091416e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6206 }, { "completion_length": 673.25, "epoch": 1.7203436807095343, "grad_norm": 0.5586047172546387, "kl": 67.81183624267578, "learning_rate": 3.7026958131389813e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6207 }, { "completion_length": 620.5, "epoch": 1.7206208425720622, "grad_norm": 0.0, "kl": 0.16965387761592865, "learning_rate": 3.702311992587144e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6208 }, { "completion_length": 755.5, "epoch": 1.7208980044345898, "grad_norm": 1.1184437274932861, "kl": 1389.1424560546875, "learning_rate": 3.701928135165399e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6209 }, { "completion_length": 819.25, "epoch": 1.7211751662971175, "grad_norm": 0.0, "kl": 0.13376250863075256, "learning_rate": 3.7015442408855183e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6210 }, { "completion_length": 727.5, "epoch": 1.7214523281596452, "grad_norm": 0.0, "kl": 0.636756956577301, "learning_rate": 3.701160309759274e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6211 }, { "completion_length": 570.5, "epoch": 1.721729490022173, "grad_norm": 0.0, "kl": 0.3749980628490448, "learning_rate": 3.700776341798439e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6212 }, { "completion_length": 854.0, "epoch": 1.7220066518847008, "grad_norm": 0.0, "kl": 0.16175241768360138, "learning_rate": 3.7003923370147894e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6213 }, { "completion_length": 662.0, "epoch": 1.7222838137472283, "grad_norm": 0.9228370189666748, "kl": 1680.073486328125, "learning_rate": 3.7000082954201007e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6214 }, { "completion_length": 737.0, "epoch": 1.7225609756097562, "grad_norm": 0.0, "kl": 0.18395812809467316, "learning_rate": 3.6996242170261485e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6215 }, { "completion_length": 648.5, "epoch": 1.7228381374722836, "grad_norm": 0.7319849729537964, "kl": 135.31163024902344, "learning_rate": 3.6992401018447123e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6216 }, { "completion_length": 681.5, "epoch": 1.7231152993348116, "grad_norm": 0.0, "kl": 0.16548363864421844, "learning_rate": 3.6988559498875707e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6217 }, { "completion_length": 695.75, "epoch": 1.7233924611973392, "grad_norm": 0.0, "kl": 0.1782829612493515, "learning_rate": 3.6984717611665044e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6218 }, { "completion_length": 718.5, "epoch": 1.723669623059867, "grad_norm": 0.0, "kl": 1.2618104219436646, "learning_rate": 3.6980875356932948e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6219 }, { "completion_length": 657.75, "epoch": 1.7239467849223948, "grad_norm": 0.0, "kl": 0.22461943328380585, "learning_rate": 3.697703273479725e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6220 }, { "completion_length": 705.75, "epoch": 1.7242239467849223, "grad_norm": 0.0, "kl": 34.515018463134766, "learning_rate": 3.6973189745375772e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6221 }, { "completion_length": 617.25, "epoch": 1.7245011086474502, "grad_norm": 0.0, "kl": 0.20077331364154816, "learning_rate": 3.6969346388786376e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6222 }, { "completion_length": 680.0, "epoch": 1.7247782705099777, "grad_norm": 0.0, "kl": 0.17977143824100494, "learning_rate": 3.6965502665146916e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6223 }, { "completion_length": 634.75, "epoch": 1.7250554323725056, "grad_norm": 0.0, "kl": 0.16449639201164246, "learning_rate": 3.6961658574575265e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6224 }, { "completion_length": 735.0, "epoch": 1.7253325942350333, "grad_norm": 0.43345579504966736, "kl": 0.14943701028823853, "learning_rate": 3.6957814117189305e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6225 }, { "completion_length": 622.0, "epoch": 1.725609756097561, "grad_norm": 0.0, "kl": 79.82665252685547, "learning_rate": 3.695396929310693e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6226 }, { "completion_length": 1091.0, "epoch": 1.7258869179600886, "grad_norm": 0.24867473542690277, "kl": 5917.5869140625, "learning_rate": 3.6950124102446037e-06, "loss": 0.0, "reward": 2.09375, "reward_std": 2.7336158752441406, "rewards/confident_score_func": 0.25, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.59375, "step": 6227 }, { "completion_length": 796.25, "epoch": 1.7261640798226163, "grad_norm": 0.38140609860420227, "kl": 43014.94140625, "learning_rate": 3.6946278545324553e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6228 }, { "completion_length": 659.0, "epoch": 1.7264412416851442, "grad_norm": 0.0, "kl": 1.4807811975479126, "learning_rate": 3.694243262186041e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6229 }, { "completion_length": 747.5, "epoch": 1.7267184035476717, "grad_norm": 0.3718612790107727, "kl": 639.82275390625, "learning_rate": 3.6938586332171527e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6230 }, { "completion_length": 633.5, "epoch": 1.7269955654101996, "grad_norm": 0.0, "kl": 0.18714067339897156, "learning_rate": 3.693473967637587e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6231 }, { "completion_length": 1024.0, "epoch": 1.7272727272727273, "grad_norm": 0.0, "kl": 0.13955412805080414, "learning_rate": 3.6930892654591395e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6232 }, { "completion_length": 748.25, "epoch": 1.727549889135255, "grad_norm": 0.35661938786506653, "kl": 44680.03125, "learning_rate": 3.692704526693607e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6233 }, { "completion_length": 678.5, "epoch": 1.7278270509977827, "grad_norm": 0.0, "kl": 0.15306980907917023, "learning_rate": 3.692319751352788e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6234 }, { "completion_length": 626.25, "epoch": 1.7281042128603104, "grad_norm": 0.0, "kl": 0.7694947719573975, "learning_rate": 3.6919349394484827e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6235 }, { "completion_length": 713.5, "epoch": 1.7283813747228383, "grad_norm": 0.0, "kl": 0.12914051115512848, "learning_rate": 3.6915500909924913e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6236 }, { "completion_length": 712.0, "epoch": 1.7286585365853657, "grad_norm": 0.0, "kl": 0.16124767065048218, "learning_rate": 3.6911652059966145e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6237 }, { "completion_length": 663.5, "epoch": 1.7289356984478936, "grad_norm": 0.0, "kl": 0.17394904792308807, "learning_rate": 3.690780284472657e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6238 }, { "completion_length": 706.75, "epoch": 1.7292128603104213, "grad_norm": 0.0, "kl": 0.19324228167533875, "learning_rate": 3.690395326432421e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6239 }, { "completion_length": 640.75, "epoch": 1.729490022172949, "grad_norm": 1.0518568754196167, "kl": 0.980936586856842, "learning_rate": 3.6900103318877123e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6240 }, { "completion_length": 740.25, "epoch": 1.7297671840354767, "grad_norm": 0.0, "kl": 8556.63671875, "learning_rate": 3.689625300850337e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6241 }, { "completion_length": 679.75, "epoch": 1.7300443458980044, "grad_norm": 0.34477391839027405, "kl": 0.18054214119911194, "learning_rate": 3.6892402333321025e-06, "loss": -0.0, "reward": 4.46875, "reward_std": 2.5625, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 6242 }, { "completion_length": 776.5, "epoch": 1.7303215077605323, "grad_norm": 0.0, "kl": 0.3990035653114319, "learning_rate": 3.688855129344817e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6243 }, { "completion_length": 723.0, "epoch": 1.7305986696230597, "grad_norm": 0.0, "kl": 0.18454428017139435, "learning_rate": 3.6884699889002905e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6244 }, { "completion_length": 729.25, "epoch": 1.7308758314855877, "grad_norm": 0.0, "kl": 5236.873046875, "learning_rate": 3.6880848120103337e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6245 }, { "completion_length": 757.0, "epoch": 1.7311529933481153, "grad_norm": 0.0, "kl": 0.1468620002269745, "learning_rate": 3.687699598686758e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6246 }, { "completion_length": 816.75, "epoch": 1.731430155210643, "grad_norm": 0.0, "kl": 0.13603757321834564, "learning_rate": 3.6873143489413753e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6247 }, { "completion_length": 735.0, "epoch": 1.7317073170731707, "grad_norm": 0.0, "kl": 0.9305853247642517, "learning_rate": 3.6869290627860012e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6248 }, { "completion_length": 813.0, "epoch": 1.7319844789356984, "grad_norm": 0.0, "kl": 0.1605769693851471, "learning_rate": 3.68654374023245e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6249 }, { "completion_length": 662.75, "epoch": 1.7322616407982263, "grad_norm": 0.0, "kl": 0.15998266637325287, "learning_rate": 3.686158381292538e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6250 }, { "completion_length": 602.75, "epoch": 1.7325388026607538, "grad_norm": 0.0, "kl": 0.22620820999145508, "learning_rate": 3.685772985978084e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6251 }, { "completion_length": 683.0, "epoch": 1.7328159645232817, "grad_norm": 0.3773311972618103, "kl": 3.5250587463378906, "learning_rate": 3.6853875543009045e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6252 }, { "completion_length": 633.25, "epoch": 1.7330931263858091, "grad_norm": 0.0, "kl": 4014.66015625, "learning_rate": 3.6850020862728196e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6253 }, { "completion_length": 588.75, "epoch": 1.733370288248337, "grad_norm": 0.0, "kl": 0.15575814247131348, "learning_rate": 3.6846165819056508e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6254 }, { "completion_length": 785.75, "epoch": 1.7336474501108647, "grad_norm": 0.0, "kl": 0.18198078870773315, "learning_rate": 3.6842310412112188e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6255 }, { "completion_length": 569.0, "epoch": 1.7339246119733924, "grad_norm": 0.0, "kl": 0.21185770630836487, "learning_rate": 3.6838454642013465e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6256 }, { "completion_length": 797.75, "epoch": 1.7342017738359203, "grad_norm": 0.0, "kl": 0.13401612639427185, "learning_rate": 3.6834598508878593e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6257 }, { "completion_length": 579.75, "epoch": 1.7344789356984478, "grad_norm": 0.0, "kl": 0.16556398570537567, "learning_rate": 3.6830742012825815e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6258 }, { "completion_length": 622.5, "epoch": 1.7347560975609757, "grad_norm": 0.0, "kl": 0.1530838906764984, "learning_rate": 3.682688515397339e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6259 }, { "completion_length": 788.5, "epoch": 1.7350332594235032, "grad_norm": 0.0, "kl": 0.17011059820652008, "learning_rate": 3.6823027932439596e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6260 }, { "completion_length": 687.75, "epoch": 1.735310421286031, "grad_norm": 0.0, "kl": 0.16034361720085144, "learning_rate": 3.6819170348342724e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6261 }, { "completion_length": 596.25, "epoch": 1.7355875831485588, "grad_norm": 0.0, "kl": 0.697814404964447, "learning_rate": 3.6815312401801064e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6262 }, { "completion_length": 646.25, "epoch": 1.7358647450110865, "grad_norm": 0.0, "kl": 0.16148923337459564, "learning_rate": 3.681145409293291e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6263 }, { "completion_length": 703.25, "epoch": 1.7361419068736141, "grad_norm": 0.0, "kl": 0.3593437373638153, "learning_rate": 3.6807595421856607e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6264 }, { "completion_length": 618.5, "epoch": 1.7364190687361418, "grad_norm": 0.0, "kl": 0.15894635021686554, "learning_rate": 3.680373638869047e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6265 }, { "completion_length": 572.5, "epoch": 1.7366962305986697, "grad_norm": 0.517145574092865, "kl": 23797.998046875, "learning_rate": 3.679987699355283e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6266 }, { "completion_length": 741.5, "epoch": 1.7369733924611972, "grad_norm": 0.0, "kl": 0.14603465795516968, "learning_rate": 3.679601723656205e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6267 }, { "completion_length": 724.0, "epoch": 1.737250554323725, "grad_norm": 0.0, "kl": 0.13423047959804535, "learning_rate": 3.679215711783651e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6268 }, { "completion_length": 662.0, "epoch": 1.7375277161862528, "grad_norm": 0.6314886808395386, "kl": 546.5376586914062, "learning_rate": 3.6788296637494547e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6269 }, { "completion_length": 697.75, "epoch": 1.7378048780487805, "grad_norm": 0.0, "kl": 0.17410771548748016, "learning_rate": 3.678443579565457e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6270 }, { "completion_length": 684.0, "epoch": 1.7380820399113082, "grad_norm": 0.0, "kl": 0.17244455218315125, "learning_rate": 3.678057459243497e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6271 }, { "completion_length": 935.0, "epoch": 1.7383592017738358, "grad_norm": 0.0, "kl": 0.12537644803524017, "learning_rate": 3.6776713027954154e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6272 }, { "completion_length": 638.75, "epoch": 1.7386363636363638, "grad_norm": 0.0, "kl": 0.141434445977211, "learning_rate": 3.677285110233053e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6273 }, { "completion_length": 659.75, "epoch": 1.7389135254988912, "grad_norm": 0.0, "kl": 0.15320704877376556, "learning_rate": 3.676898881568255e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6274 }, { "completion_length": 615.5, "epoch": 1.7391906873614191, "grad_norm": 0.0, "kl": 0.1668592244386673, "learning_rate": 3.6765126168128635e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6275 }, { "completion_length": 718.25, "epoch": 1.7394678492239468, "grad_norm": 0.0, "kl": 0.2926475405693054, "learning_rate": 3.6761263159787242e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6276 }, { "completion_length": 796.25, "epoch": 1.7397450110864745, "grad_norm": 0.0, "kl": 0.32719627022743225, "learning_rate": 3.6757399790776836e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6277 }, { "completion_length": 617.75, "epoch": 1.7400221729490022, "grad_norm": 0.0, "kl": 0.9572044610977173, "learning_rate": 3.6753536061215882e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6278 }, { "completion_length": 729.0, "epoch": 1.7402993348115299, "grad_norm": 0.0, "kl": 0.14118856191635132, "learning_rate": 3.6749671971222873e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6279 }, { "completion_length": 698.25, "epoch": 1.7405764966740578, "grad_norm": 0.0, "kl": 0.15459412336349487, "learning_rate": 3.6745807520916303e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6280 }, { "completion_length": 813.5, "epoch": 1.7408536585365852, "grad_norm": 0.0, "kl": 0.17808854579925537, "learning_rate": 3.674194271041468e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6281 }, { "completion_length": 782.75, "epoch": 1.7411308203991132, "grad_norm": 0.5843966603279114, "kl": 0.16203244030475616, "learning_rate": 3.6738077539836513e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6282 }, { "completion_length": 716.25, "epoch": 1.7414079822616408, "grad_norm": 0.0, "kl": 0.14296963810920715, "learning_rate": 3.6734212009300346e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6283 }, { "completion_length": 590.25, "epoch": 1.7416851441241685, "grad_norm": 0.0, "kl": 0.18268044292926788, "learning_rate": 3.6730346118924704e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6284 }, { "completion_length": 822.75, "epoch": 1.7419623059866962, "grad_norm": 0.5109179615974426, "kl": 20747.244140625, "learning_rate": 3.6726479868828146e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6285 }, { "completion_length": 655.75, "epoch": 1.742239467849224, "grad_norm": 0.0, "kl": 0.22913645207881927, "learning_rate": 3.6722613259129226e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6286 }, { "completion_length": 687.0, "epoch": 1.7425166297117518, "grad_norm": 0.0, "kl": 0.17591163516044617, "learning_rate": 3.671874628994653e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6287 }, { "completion_length": 562.75, "epoch": 1.7427937915742793, "grad_norm": 1.0350978374481201, "kl": 0.7679610848426819, "learning_rate": 3.6714878961398626e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6288 }, { "completion_length": 757.75, "epoch": 1.7430709534368072, "grad_norm": 0.0, "kl": 0.1748802810907364, "learning_rate": 3.6711011273604124e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6289 }, { "completion_length": 656.75, "epoch": 1.7433481152993349, "grad_norm": 0.0, "kl": 0.15013037621974945, "learning_rate": 3.670714322668162e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6290 }, { "completion_length": 617.25, "epoch": 1.7436252771618626, "grad_norm": 0.0, "kl": 0.1582535207271576, "learning_rate": 3.6703274820749736e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6291 }, { "completion_length": 651.25, "epoch": 1.7439024390243902, "grad_norm": 0.0, "kl": 0.21386109292507172, "learning_rate": 3.6699406055927088e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6292 }, { "completion_length": 726.5, "epoch": 1.744179600886918, "grad_norm": 0.0, "kl": 0.1557488888502121, "learning_rate": 3.6695536932332344e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6293 }, { "completion_length": 678.5, "epoch": 1.7444567627494458, "grad_norm": 0.0, "kl": 0.3674596846103668, "learning_rate": 3.6691667450084114e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6294 }, { "completion_length": 801.0, "epoch": 1.7447339246119733, "grad_norm": 0.0, "kl": 0.14470486342906952, "learning_rate": 3.668779760930109e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6295 }, { "completion_length": 647.25, "epoch": 1.7450110864745012, "grad_norm": 0.0, "kl": 0.190164253115654, "learning_rate": 3.668392741010193e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6296 }, { "completion_length": 654.75, "epoch": 1.7452882483370287, "grad_norm": 0.0, "kl": 9.694084167480469, "learning_rate": 3.6680056852605317e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6297 }, { "completion_length": 767.5, "epoch": 1.7455654101995566, "grad_norm": 0.0, "kl": 0.15620331466197968, "learning_rate": 3.667618593692995e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6298 }, { "completion_length": 656.5, "epoch": 1.7458425720620843, "grad_norm": 0.0, "kl": 0.19379140436649323, "learning_rate": 3.667231466319454e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6299 }, { "completion_length": 644.0, "epoch": 1.746119733924612, "grad_norm": 0.0, "kl": 0.15330930054187775, "learning_rate": 3.666844303151779e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6300 }, { "completion_length": 657.0, "epoch": 1.7463968957871396, "grad_norm": 0.0, "kl": 0.15976285934448242, "learning_rate": 3.6664571042018427e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6301 }, { "completion_length": 749.75, "epoch": 1.7466740576496673, "grad_norm": 0.34789201617240906, "kl": 0.18165667355060577, "learning_rate": 3.66606986948152e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6302 }, { "completion_length": 637.0, "epoch": 1.7469512195121952, "grad_norm": 0.0, "kl": 0.15380336344242096, "learning_rate": 3.6656825990026847e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6303 }, { "completion_length": 673.0, "epoch": 1.7472283813747227, "grad_norm": 0.0, "kl": 0.17414933443069458, "learning_rate": 3.665295292777214e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6304 }, { "completion_length": 598.5, "epoch": 1.7475055432372506, "grad_norm": 0.9353801012039185, "kl": 260.9443054199219, "learning_rate": 3.6649079508169833e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6305 }, { "completion_length": 562.0, "epoch": 1.7477827050997783, "grad_norm": 0.0, "kl": 0.2716572880744934, "learning_rate": 3.6645205731338725e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6306 }, { "completion_length": 794.75, "epoch": 1.748059866962306, "grad_norm": 0.4128887951374054, "kl": 11951.49609375, "learning_rate": 3.664133159739759e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6307 }, { "completion_length": 687.25, "epoch": 1.7483370288248337, "grad_norm": 0.0, "kl": 0.16241779923439026, "learning_rate": 3.663745710646525e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6308 }, { "completion_length": 623.25, "epoch": 1.7486141906873613, "grad_norm": 0.0, "kl": 0.16339008510112762, "learning_rate": 3.66335822586605e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6309 }, { "completion_length": 686.75, "epoch": 1.7488913525498893, "grad_norm": 2.2175238132476807, "kl": 4121.5322265625, "learning_rate": 3.6629707054102187e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6310 }, { "completion_length": 612.0, "epoch": 1.7491685144124167, "grad_norm": 0.0, "kl": 0.15267059206962585, "learning_rate": 3.6625831492909135e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6311 }, { "completion_length": 541.75, "epoch": 1.7494456762749446, "grad_norm": 0.0, "kl": 0.20863613486289978, "learning_rate": 3.6621955575200195e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6312 }, { "completion_length": 665.25, "epoch": 1.7497228381374723, "grad_norm": 1.4992543458938599, "kl": 0.3287965655326843, "learning_rate": 3.661807930109422e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6313 }, { "completion_length": 625.25, "epoch": 1.75, "grad_norm": 0.0, "kl": 0.16065341234207153, "learning_rate": 3.661420267071008e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6314 }, { "completion_length": 698.0, "epoch": 1.7502771618625277, "grad_norm": 0.0, "kl": 0.3860022723674774, "learning_rate": 3.6610325684166663e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6315 }, { "completion_length": 546.5, "epoch": 1.7505543237250554, "grad_norm": 1.1011942625045776, "kl": 208065.65625, "learning_rate": 3.660644834158286e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6316 }, { "completion_length": 690.25, "epoch": 1.7508314855875833, "grad_norm": 0.0, "kl": 0.15916049480438232, "learning_rate": 3.6602570643077555e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6317 }, { "completion_length": 621.0, "epoch": 1.7511086474501107, "grad_norm": 2.278589963912964, "kl": 190469.65625, "learning_rate": 3.659869258876968e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6318 }, { "completion_length": 582.75, "epoch": 1.7513858093126387, "grad_norm": 0.0, "kl": 0.2125379741191864, "learning_rate": 3.6594814178778155e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6319 }, { "completion_length": 710.0, "epoch": 1.7516629711751663, "grad_norm": 0.0, "kl": 0.19448065757751465, "learning_rate": 3.6590935413221906e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6320 }, { "completion_length": 633.75, "epoch": 1.751940133037694, "grad_norm": 0.0, "kl": 0.20947493612766266, "learning_rate": 3.6587056292219898e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6321 }, { "completion_length": 645.75, "epoch": 1.7522172949002217, "grad_norm": 0.0, "kl": 0.21292611956596375, "learning_rate": 3.658317681589106e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6322 }, { "completion_length": 766.25, "epoch": 1.7524944567627494, "grad_norm": 0.0, "kl": 0.19510318338871002, "learning_rate": 3.6579296984354386e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6323 }, { "completion_length": 662.25, "epoch": 1.7527716186252773, "grad_norm": 0.0, "kl": 0.3405977189540863, "learning_rate": 3.657541679772883e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6324 }, { "completion_length": 553.25, "epoch": 1.7530487804878048, "grad_norm": 0.0, "kl": 0.1943880021572113, "learning_rate": 3.65715362561334e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6325 }, { "completion_length": 597.25, "epoch": 1.7533259423503327, "grad_norm": 0.0, "kl": 0.16938762366771698, "learning_rate": 3.656765535968709e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6326 }, { "completion_length": 573.75, "epoch": 1.7536031042128604, "grad_norm": 0.0, "kl": 0.1837325543165207, "learning_rate": 3.6563774108508915e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6327 }, { "completion_length": 828.75, "epoch": 1.753880266075388, "grad_norm": 0.0, "kl": 0.15808896720409393, "learning_rate": 3.655989250271788e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6328 }, { "completion_length": 582.75, "epoch": 1.7541574279379157, "grad_norm": 0.0, "kl": 0.22117845714092255, "learning_rate": 3.655601054243304e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6329 }, { "completion_length": 764.75, "epoch": 1.7544345898004434, "grad_norm": 0.0, "kl": 0.19580857455730438, "learning_rate": 3.6552128227773428e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6330 }, { "completion_length": 577.0, "epoch": 1.7547117516629713, "grad_norm": 0.0, "kl": 0.26046091318130493, "learning_rate": 3.6548245558858092e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6331 }, { "completion_length": 614.0, "epoch": 1.7549889135254988, "grad_norm": 0.0, "kl": 0.21482256054878235, "learning_rate": 3.6544362535806104e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6332 }, { "completion_length": 625.25, "epoch": 1.7552660753880267, "grad_norm": 0.4612833857536316, "kl": 1136113.5, "learning_rate": 3.6540479158736547e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6333 }, { "completion_length": 612.5, "epoch": 1.7555432372505542, "grad_norm": 0.0, "kl": 0.23914800584316254, "learning_rate": 3.65365954277685e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6334 }, { "completion_length": 675.5, "epoch": 1.755820399113082, "grad_norm": 0.0, "kl": 1.8589438199996948, "learning_rate": 3.653271134302106e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6335 }, { "completion_length": 673.0, "epoch": 1.7560975609756098, "grad_norm": 0.0, "kl": 0.16071754693984985, "learning_rate": 3.652882690461334e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6336 }, { "completion_length": 643.0, "epoch": 1.7563747228381374, "grad_norm": 0.0, "kl": 0.22459661960601807, "learning_rate": 3.6524942112664457e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6337 }, { "completion_length": 688.5, "epoch": 1.7566518847006651, "grad_norm": 0.0, "kl": 0.17809146642684937, "learning_rate": 3.652105696729354e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6338 }, { "completion_length": 837.25, "epoch": 1.7569290465631928, "grad_norm": 0.0, "kl": 0.171071395277977, "learning_rate": 3.651717146861973e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6339 }, { "completion_length": 621.25, "epoch": 1.7572062084257207, "grad_norm": 0.0, "kl": 0.2887056767940521, "learning_rate": 3.651328561676219e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6340 }, { "completion_length": 665.75, "epoch": 1.7574833702882482, "grad_norm": 0.0, "kl": 0.18987645208835602, "learning_rate": 3.650939941184007e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6341 }, { "completion_length": 523.0, "epoch": 1.757760532150776, "grad_norm": 0.0, "kl": 0.23135153949260712, "learning_rate": 3.6505512853972547e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6342 }, { "completion_length": 602.25, "epoch": 1.7580376940133038, "grad_norm": 0.0, "kl": 0.24016772210597992, "learning_rate": 3.650162594327881e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6343 }, { "completion_length": 536.0, "epoch": 1.7583148558758315, "grad_norm": 0.0, "kl": 0.1821790486574173, "learning_rate": 3.6497738679878043e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6344 }, { "completion_length": 647.75, "epoch": 1.7585920177383592, "grad_norm": 0.0, "kl": 0.3156242370605469, "learning_rate": 3.6493851063889466e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6345 }, { "completion_length": 640.25, "epoch": 1.7588691796008868, "grad_norm": 0.0, "kl": 0.23978398740291595, "learning_rate": 3.648996309543229e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6346 }, { "completion_length": 722.0, "epoch": 1.7591463414634148, "grad_norm": 0.0, "kl": 0.14224699139595032, "learning_rate": 3.648607477462574e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6347 }, { "completion_length": 542.25, "epoch": 1.7594235033259422, "grad_norm": 0.0, "kl": 0.20825152099132538, "learning_rate": 3.6482186101589055e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6348 }, { "completion_length": 616.25, "epoch": 1.7597006651884701, "grad_norm": 0.0, "kl": 80158.875, "learning_rate": 3.64782970764415e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6349 }, { "completion_length": 602.0, "epoch": 1.7599778270509978, "grad_norm": 0.5011602640151978, "kl": 1279.1051025390625, "learning_rate": 3.647440769930231e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6350 }, { "completion_length": 640.5, "epoch": 1.7602549889135255, "grad_norm": 0.0, "kl": 0.20639155805110931, "learning_rate": 3.6470517970290775e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6351 }, { "completion_length": 567.25, "epoch": 1.7605321507760532, "grad_norm": 0.0, "kl": 0.3706102669239044, "learning_rate": 3.6466627889526163e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6352 }, { "completion_length": 615.0, "epoch": 1.7608093126385809, "grad_norm": 0.0, "kl": 0.4094788432121277, "learning_rate": 3.646273745712778e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6353 }, { "completion_length": 593.0, "epoch": 1.7610864745011088, "grad_norm": 0.0, "kl": 0.2378227710723877, "learning_rate": 3.6458846673214914e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6354 }, { "completion_length": 709.25, "epoch": 1.7613636363636362, "grad_norm": 0.0, "kl": 0.19381015002727509, "learning_rate": 3.64549555379069e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6355 }, { "completion_length": 563.5, "epoch": 1.7616407982261642, "grad_norm": 0.0, "kl": 0.21095818281173706, "learning_rate": 3.6451064051323043e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6356 }, { "completion_length": 608.25, "epoch": 1.7619179600886918, "grad_norm": 0.0, "kl": 0.29351484775543213, "learning_rate": 3.6447172213582692e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6357 }, { "completion_length": 556.25, "epoch": 1.7621951219512195, "grad_norm": 0.0, "kl": 0.21643303334712982, "learning_rate": 3.644328002480518e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6358 }, { "completion_length": 609.0, "epoch": 1.7624722838137472, "grad_norm": 0.0, "kl": 4.191649436950684, "learning_rate": 3.643938748510989e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6359 }, { "completion_length": 624.75, "epoch": 1.762749445676275, "grad_norm": 0.0, "kl": 0.2169201374053955, "learning_rate": 3.6435494594616155e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6360 }, { "completion_length": 583.75, "epoch": 1.7630266075388028, "grad_norm": 0.0, "kl": 0.25373056530952454, "learning_rate": 3.643160135344338e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6361 }, { "completion_length": 556.5, "epoch": 1.7633037694013303, "grad_norm": 0.0, "kl": 0.6181463599205017, "learning_rate": 3.642770776171094e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6362 }, { "completion_length": 534.0, "epoch": 1.7635809312638582, "grad_norm": 0.0, "kl": 0.20083867013454437, "learning_rate": 3.6423813819538245e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6363 }, { "completion_length": 635.5, "epoch": 1.7638580931263859, "grad_norm": 0.35690727829933167, "kl": 0.18677303194999695, "learning_rate": 3.64199195270447e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6364 }, { "completion_length": 705.25, "epoch": 1.7641352549889135, "grad_norm": 0.0, "kl": 0.27364057302474976, "learning_rate": 3.6416024884349727e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6365 }, { "completion_length": 636.0, "epoch": 1.7644124168514412, "grad_norm": 0.40749767422676086, "kl": 113564.3046875, "learning_rate": 3.6412129891572766e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6366 }, { "completion_length": 765.25, "epoch": 1.764689578713969, "grad_norm": 1.4111467599868774, "kl": 35922.34375, "learning_rate": 3.640823454883325e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6367 }, { "completion_length": 573.5, "epoch": 1.7649667405764968, "grad_norm": 0.0, "kl": 0.21487738192081451, "learning_rate": 3.6404338856250636e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6368 }, { "completion_length": 589.75, "epoch": 1.7652439024390243, "grad_norm": 0.0, "kl": 0.19578295946121216, "learning_rate": 3.6400442813944394e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6369 }, { "completion_length": 790.5, "epoch": 1.7655210643015522, "grad_norm": 0.0, "kl": 0.19687046110630035, "learning_rate": 3.6396546422033998e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6370 }, { "completion_length": 616.0, "epoch": 1.7657982261640797, "grad_norm": 0.0, "kl": 0.22980739176273346, "learning_rate": 3.6392649680638926e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6371 }, { "completion_length": 667.25, "epoch": 1.7660753880266076, "grad_norm": 0.0, "kl": 38454564.0, "learning_rate": 3.6388752589878685e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6372 }, { "completion_length": 597.0, "epoch": 1.7663525498891353, "grad_norm": 0.0, "kl": 0.25911614298820496, "learning_rate": 3.6384855149872776e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6373 }, { "completion_length": 555.0, "epoch": 1.766629711751663, "grad_norm": 0.0, "kl": 0.2814558744430542, "learning_rate": 3.638095736074072e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6374 }, { "completion_length": 580.75, "epoch": 1.7669068736141909, "grad_norm": 0.0, "kl": 68683248.0, "learning_rate": 3.6377059222602047e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6375 }, { "completion_length": 525.25, "epoch": 1.7671840354767183, "grad_norm": 0.0, "kl": 0.22108116745948792, "learning_rate": 3.6373160735576285e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6376 }, { "completion_length": 742.0, "epoch": 1.7674611973392462, "grad_norm": 0.0, "kl": 0.20223085582256317, "learning_rate": 3.6369261899783005e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6377 }, { "completion_length": 679.5, "epoch": 1.7677383592017737, "grad_norm": 0.0, "kl": 0.19278161227703094, "learning_rate": 3.6365362715341756e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6378 }, { "completion_length": 526.25, "epoch": 1.7680155210643016, "grad_norm": 0.0, "kl": 0.713674783706665, "learning_rate": 3.636146318237211e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6379 }, { "completion_length": 651.0, "epoch": 1.7682926829268293, "grad_norm": 0.0, "kl": 0.18310776352882385, "learning_rate": 3.6357563300993648e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6380 }, { "completion_length": 541.5, "epoch": 1.768569844789357, "grad_norm": 0.0, "kl": 0.3877178728580475, "learning_rate": 3.6353663071325973e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6381 }, { "completion_length": 620.5, "epoch": 1.7688470066518847, "grad_norm": 0.0, "kl": 0.35017165541648865, "learning_rate": 3.634976249348867e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6382 }, { "completion_length": 577.5, "epoch": 1.7691241685144123, "grad_norm": 0.0, "kl": 0.18814554810523987, "learning_rate": 3.6345861567601376e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6383 }, { "completion_length": 654.75, "epoch": 1.7694013303769403, "grad_norm": 0.0, "kl": 0.17766156792640686, "learning_rate": 3.6341960293783694e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6384 }, { "completion_length": 715.5, "epoch": 1.7696784922394677, "grad_norm": 0.35931196808815, "kl": 4851891.5, "learning_rate": 3.6338058672155284e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6385 }, { "completion_length": 543.25, "epoch": 1.7699556541019956, "grad_norm": 0.0, "kl": 0.3105721175670624, "learning_rate": 3.633415670283578e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6386 }, { "completion_length": 624.5, "epoch": 1.7702328159645233, "grad_norm": 0.0, "kl": 0.18825004994869232, "learning_rate": 3.633025438594483e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6387 }, { "completion_length": 671.5, "epoch": 1.770509977827051, "grad_norm": 2.0272738933563232, "kl": 22635.353515625, "learning_rate": 3.6326351721602115e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6388 }, { "completion_length": 593.75, "epoch": 1.7707871396895787, "grad_norm": 0.0, "kl": 1.0133368968963623, "learning_rate": 3.632244870992731e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6389 }, { "completion_length": 604.0, "epoch": 1.7710643015521064, "grad_norm": 0.0, "kl": 0.22119195759296417, "learning_rate": 3.6318545351040104e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6390 }, { "completion_length": 587.25, "epoch": 1.7713414634146343, "grad_norm": 0.0, "kl": 3.3859307765960693, "learning_rate": 3.631464164506019e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6391 }, { "completion_length": 671.0, "epoch": 1.7716186252771617, "grad_norm": 0.0, "kl": 0.18720105290412903, "learning_rate": 3.6310737592107283e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6392 }, { "completion_length": 624.75, "epoch": 1.7718957871396896, "grad_norm": 0.0, "kl": 0.19572915136814117, "learning_rate": 3.6306833192301117e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6393 }, { "completion_length": 534.5, "epoch": 1.7721729490022173, "grad_norm": 0.0, "kl": 76.50952911376953, "learning_rate": 3.63029284457614e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6394 }, { "completion_length": 597.0, "epoch": 1.772450110864745, "grad_norm": 0.0, "kl": 0.3893226385116577, "learning_rate": 3.6299023352607894e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6395 }, { "completion_length": 595.0, "epoch": 1.7727272727272727, "grad_norm": 0.0, "kl": 0.3230378031730652, "learning_rate": 3.6295117912960345e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6396 }, { "completion_length": 594.25, "epoch": 1.7730044345898004, "grad_norm": 0.0, "kl": 0.2130642682313919, "learning_rate": 3.6291212126938513e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6397 }, { "completion_length": 586.75, "epoch": 1.7732815964523283, "grad_norm": 0.0, "kl": 3037482.5, "learning_rate": 3.6287305994662173e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6398 }, { "completion_length": 608.5, "epoch": 1.7735587583148558, "grad_norm": 0.0, "kl": 0.21442529559135437, "learning_rate": 3.628339951625112e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6399 }, { "completion_length": 614.25, "epoch": 1.7738359201773837, "grad_norm": 0.0, "kl": 0.1859341412782669, "learning_rate": 3.6279492691825135e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6400 }, { "completion_length": 585.0, "epoch": 1.7741130820399114, "grad_norm": 0.0, "kl": 0.19674725830554962, "learning_rate": 3.627558552150403e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6401 }, { "completion_length": 628.25, "epoch": 1.774390243902439, "grad_norm": 0.0, "kl": 0.16892778873443604, "learning_rate": 3.627167800540763e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6402 }, { "completion_length": 603.75, "epoch": 1.7746674057649667, "grad_norm": 0.0, "kl": 1.950382947921753, "learning_rate": 3.6267770143655743e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6403 }, { "completion_length": 588.0, "epoch": 1.7749445676274944, "grad_norm": 0.0, "kl": 0.23370800912380219, "learning_rate": 3.6263861936368234e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6404 }, { "completion_length": 668.0, "epoch": 1.7752217294900223, "grad_norm": 0.0, "kl": 0.38038772344589233, "learning_rate": 3.6259953383664924e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6405 }, { "completion_length": 608.75, "epoch": 1.7754988913525498, "grad_norm": 0.0, "kl": 0.21043093502521515, "learning_rate": 3.6256044485665686e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6406 }, { "completion_length": 653.0, "epoch": 1.7757760532150777, "grad_norm": 0.0, "kl": 0.24545086920261383, "learning_rate": 3.6252135242490387e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6407 }, { "completion_length": 539.75, "epoch": 1.7760532150776052, "grad_norm": 0.0, "kl": 0.42513787746429443, "learning_rate": 3.624822565425891e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6408 }, { "completion_length": 605.75, "epoch": 1.776330376940133, "grad_norm": 0.0, "kl": 0.3264833688735962, "learning_rate": 3.6244315721091145e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6409 }, { "completion_length": 593.25, "epoch": 1.7766075388026608, "grad_norm": 0.0, "kl": 0.1632528454065323, "learning_rate": 3.624040544310699e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6410 }, { "completion_length": 598.0, "epoch": 1.7768847006651884, "grad_norm": 0.4845138192176819, "kl": 52934.26171875, "learning_rate": 3.623649482042636e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6411 }, { "completion_length": 629.0, "epoch": 1.7771618625277164, "grad_norm": 0.0, "kl": 0.2232140302658081, "learning_rate": 3.623258385316917e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6412 }, { "completion_length": 739.5, "epoch": 1.7774390243902438, "grad_norm": 0.0, "kl": 0.2271333932876587, "learning_rate": 3.622867254145537e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6413 }, { "completion_length": 594.75, "epoch": 1.7777161862527717, "grad_norm": 0.0, "kl": 0.23884959518909454, "learning_rate": 3.6224760885404885e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6414 }, { "completion_length": 595.25, "epoch": 1.7779933481152992, "grad_norm": 0.0, "kl": 0.23336634039878845, "learning_rate": 3.622084888513768e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6415 }, { "completion_length": 591.75, "epoch": 1.778270509977827, "grad_norm": 0.0, "kl": 0.18865318596363068, "learning_rate": 3.621693654077372e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6416 }, { "completion_length": 674.75, "epoch": 1.7785476718403548, "grad_norm": 0.0, "kl": 0.2591225206851959, "learning_rate": 3.6213023852432975e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6417 }, { "completion_length": 637.75, "epoch": 1.7788248337028825, "grad_norm": 0.0, "kl": 1637413.75, "learning_rate": 3.620911082023544e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6418 }, { "completion_length": 641.0, "epoch": 1.7791019955654102, "grad_norm": 0.0, "kl": 0.5949278473854065, "learning_rate": 3.6205197444301097e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6419 }, { "completion_length": 566.0, "epoch": 1.7793791574279378, "grad_norm": 0.0, "kl": 0.229505255818367, "learning_rate": 3.6201283724749965e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6420 }, { "completion_length": 607.25, "epoch": 1.7796563192904657, "grad_norm": 0.0, "kl": 0.2729884684085846, "learning_rate": 3.6197369661702052e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6421 }, { "completion_length": 585.0, "epoch": 1.7799334811529932, "grad_norm": 0.0, "kl": 0.30225616693496704, "learning_rate": 3.6193455255277393e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6422 }, { "completion_length": 674.5, "epoch": 1.7802106430155211, "grad_norm": 0.0, "kl": 0.21616767346858978, "learning_rate": 3.6189540505596033e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6423 }, { "completion_length": 640.5, "epoch": 1.7804878048780488, "grad_norm": 0.0, "kl": 0.1468362659215927, "learning_rate": 3.6185625412778005e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6424 }, { "completion_length": 631.0, "epoch": 1.7807649667405765, "grad_norm": 0.0, "kl": 0.19914668798446655, "learning_rate": 3.618170997694338e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6425 }, { "completion_length": 706.75, "epoch": 1.7810421286031042, "grad_norm": 0.0, "kl": 0.17576439678668976, "learning_rate": 3.6177794198212225e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6426 }, { "completion_length": 566.0, "epoch": 1.7813192904656319, "grad_norm": 0.0, "kl": 0.26264163851737976, "learning_rate": 3.617387807670461e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6427 }, { "completion_length": 719.75, "epoch": 1.7815964523281598, "grad_norm": 0.0, "kl": 0.2968905568122864, "learning_rate": 3.6169961612540648e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6428 }, { "completion_length": 623.75, "epoch": 1.7818736141906872, "grad_norm": 2.2843005657196045, "kl": 1254508.25, "learning_rate": 3.6166044805840427e-06, "loss": 0.0, "reward": 1.625, "reward_std": 0.25, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6429 }, { "completion_length": 688.5, "epoch": 1.7821507760532151, "grad_norm": 0.0, "kl": 1676.2861328125, "learning_rate": 3.616212765672406e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6430 }, { "completion_length": 632.0, "epoch": 1.7824279379157428, "grad_norm": 0.0, "kl": 0.18983317911624908, "learning_rate": 3.615821016531167e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6431 }, { "completion_length": 672.0, "epoch": 1.7827050997782705, "grad_norm": 0.0, "kl": 0.511247992515564, "learning_rate": 3.6154292331723396e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6432 }, { "completion_length": 652.75, "epoch": 1.7829822616407982, "grad_norm": 0.4593806564807892, "kl": 967620.125, "learning_rate": 3.615037415607937e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6433 }, { "completion_length": 757.75, "epoch": 1.783259423503326, "grad_norm": 0.0, "kl": 12.363323211669922, "learning_rate": 3.6146455638499763e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6434 }, { "completion_length": 693.75, "epoch": 1.7835365853658538, "grad_norm": 0.0, "kl": 0.3082387447357178, "learning_rate": 3.614253677910472e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6435 }, { "completion_length": 623.0, "epoch": 1.7838137472283813, "grad_norm": 0.0, "kl": 0.2582436501979828, "learning_rate": 3.613861757801443e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6436 }, { "completion_length": 700.25, "epoch": 1.7840909090909092, "grad_norm": 0.0, "kl": 0.18772973120212555, "learning_rate": 3.613469803534907e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6437 }, { "completion_length": 658.25, "epoch": 1.7843680709534369, "grad_norm": 0.0, "kl": 0.2210942506790161, "learning_rate": 3.6130778151228853e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6438 }, { "completion_length": 610.25, "epoch": 1.7846452328159645, "grad_norm": 0.0, "kl": 0.20980411767959595, "learning_rate": 3.612685792577396e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6439 }, { "completion_length": 863.25, "epoch": 1.7849223946784922, "grad_norm": NaN, "kl": 0.30006295442581177, "learning_rate": 3.6122937359104625e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6440 }, { "completion_length": 608.0, "epoch": 1.78519955654102, "grad_norm": 0.0, "kl": 0.2498069703578949, "learning_rate": 3.6122937359104625e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6441 }, { "completion_length": 667.5, "epoch": 1.7854767184035478, "grad_norm": 0.0, "kl": 0.24504634737968445, "learning_rate": 3.611901645134107e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6442 }, { "completion_length": 567.0, "epoch": 1.7857538802660753, "grad_norm": 0.0, "kl": 0.40605321526527405, "learning_rate": 3.6115095202603532e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6443 }, { "completion_length": 565.25, "epoch": 1.7860310421286032, "grad_norm": 0.0, "kl": 0.19710713624954224, "learning_rate": 3.6111173613012267e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6444 }, { "completion_length": 650.5, "epoch": 1.7863082039911307, "grad_norm": 0.0, "kl": 0.2038268744945526, "learning_rate": 3.6107251682687532e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6445 }, { "completion_length": 632.25, "epoch": 1.7865853658536586, "grad_norm": 0.0, "kl": 9797944.0, "learning_rate": 3.610332941174959e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6446 }, { "completion_length": 579.75, "epoch": 1.7868625277161863, "grad_norm": 0.0, "kl": 0.2764040231704712, "learning_rate": 3.6099406800318714e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6447 }, { "completion_length": 560.0, "epoch": 1.787139689578714, "grad_norm": 0.0, "kl": 0.2223089337348938, "learning_rate": 3.6095483848515223e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6448 }, { "completion_length": 857.5, "epoch": 1.7874168514412418, "grad_norm": 0.0, "kl": 0.1607898622751236, "learning_rate": 3.609156055645938e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6449 }, { "completion_length": 591.25, "epoch": 1.7876940133037693, "grad_norm": 0.0, "kl": 0.29694125056266785, "learning_rate": 3.608763692427153e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6450 }, { "completion_length": 656.25, "epoch": 1.7879711751662972, "grad_norm": 0.0, "kl": 0.2458232045173645, "learning_rate": 3.608371295207196e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6451 }, { "completion_length": 522.75, "epoch": 1.7882483370288247, "grad_norm": 0.0, "kl": 0.6048442125320435, "learning_rate": 3.607978863998104e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6452 }, { "completion_length": 702.0, "epoch": 1.7885254988913526, "grad_norm": 0.0, "kl": 0.19957809150218964, "learning_rate": 3.607586398811908e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6453 }, { "completion_length": 673.0, "epoch": 1.7888026607538803, "grad_norm": 0.0, "kl": 0.14507035911083221, "learning_rate": 3.6071938996606454e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6454 }, { "completion_length": 614.5, "epoch": 1.789079822616408, "grad_norm": 0.0, "kl": 1.057713508605957, "learning_rate": 3.606801366556351e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6455 }, { "completion_length": 610.25, "epoch": 1.7893569844789357, "grad_norm": 0.0, "kl": 0.2528369426727295, "learning_rate": 3.6064087995110634e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6456 }, { "completion_length": 601.75, "epoch": 1.7896341463414633, "grad_norm": 0.0, "kl": 0.2374386340379715, "learning_rate": 3.6060161985368202e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6457 }, { "completion_length": 650.0, "epoch": 1.7899113082039912, "grad_norm": 1.6583367586135864, "kl": 13014.6884765625, "learning_rate": 3.6056235636456617e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6458 }, { "completion_length": 733.5, "epoch": 1.7901884700665187, "grad_norm": 0.0, "kl": 0.17052003741264343, "learning_rate": 3.605230894849627e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6459 }, { "completion_length": 579.75, "epoch": 1.7904656319290466, "grad_norm": 0.0, "kl": 0.2767399847507477, "learning_rate": 3.604838192160759e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6460 }, { "completion_length": 742.0, "epoch": 1.7907427937915743, "grad_norm": 0.0, "kl": 0.6941555142402649, "learning_rate": 3.604445455591099e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6461 }, { "completion_length": 697.25, "epoch": 1.791019955654102, "grad_norm": 0.0, "kl": 0.21289776265621185, "learning_rate": 3.6040526851526914e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6462 }, { "completion_length": 656.0, "epoch": 1.7912971175166297, "grad_norm": 0.0, "kl": 0.21096637845039368, "learning_rate": 3.603659880857581e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6463 }, { "completion_length": 605.5, "epoch": 1.7915742793791574, "grad_norm": 0.0, "kl": 0.1678292155265808, "learning_rate": 3.603267042717813e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6464 }, { "completion_length": 662.5, "epoch": 1.7918514412416853, "grad_norm": 0.0, "kl": 0.21533960103988647, "learning_rate": 3.602874170745435e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6465 }, { "completion_length": 543.0, "epoch": 1.7921286031042127, "grad_norm": 0.0, "kl": 0.20254391431808472, "learning_rate": 3.602481264952493e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6466 }, { "completion_length": 578.75, "epoch": 1.7924057649667406, "grad_norm": 0.0, "kl": 0.18584221601486206, "learning_rate": 3.6020883253510377e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6467 }, { "completion_length": 583.25, "epoch": 1.7926829268292683, "grad_norm": 0.0, "kl": 0.36546847224235535, "learning_rate": 3.6016953519531166e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6468 }, { "completion_length": 526.75, "epoch": 1.792960088691796, "grad_norm": 0.0, "kl": 0.22002391517162323, "learning_rate": 3.6013023447707836e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6469 }, { "completion_length": 709.25, "epoch": 1.7932372505543237, "grad_norm": 0.0, "kl": 0.4961562752723694, "learning_rate": 3.600909303816088e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6470 }, { "completion_length": 718.25, "epoch": 1.7935144124168514, "grad_norm": 0.0, "kl": 0.2153029590845108, "learning_rate": 3.600516229101084e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6471 }, { "completion_length": 697.0, "epoch": 1.7937915742793793, "grad_norm": 0.0, "kl": 0.2342739999294281, "learning_rate": 3.6001231206378256e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6472 }, { "completion_length": 688.25, "epoch": 1.7940687361419068, "grad_norm": 0.0, "kl": 0.31942394375801086, "learning_rate": 3.5997299784383676e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6473 }, { "completion_length": 610.5, "epoch": 1.7943458980044347, "grad_norm": 0.0, "kl": 331544.71875, "learning_rate": 3.5993368025147647e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6474 }, { "completion_length": 622.75, "epoch": 1.7946230598669624, "grad_norm": 0.0, "kl": 0.2837570905685425, "learning_rate": 3.5989435928790763e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6475 }, { "completion_length": 754.25, "epoch": 1.79490022172949, "grad_norm": 0.0, "kl": 0.18369202315807343, "learning_rate": 3.5985503495433593e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6476 }, { "completion_length": 682.5, "epoch": 1.7951773835920177, "grad_norm": 0.0, "kl": 0.1540602743625641, "learning_rate": 3.5981570725196728e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6477 }, { "completion_length": 723.75, "epoch": 1.7954545454545454, "grad_norm": 0.7124887704849243, "kl": 0.15256449580192566, "learning_rate": 3.597763761820077e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6478 }, { "completion_length": 613.75, "epoch": 1.7957317073170733, "grad_norm": 0.8285497426986694, "kl": 131.0375213623047, "learning_rate": 3.597370417456633e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6479 }, { "completion_length": 657.25, "epoch": 1.7960088691796008, "grad_norm": 0.0, "kl": 0.20313052833080292, "learning_rate": 3.596977039441404e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6480 }, { "completion_length": 566.5, "epoch": 1.7962860310421287, "grad_norm": 0.0, "kl": 0.2516306936740875, "learning_rate": 3.596583627786452e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6481 }, { "completion_length": 799.0, "epoch": 1.7965631929046562, "grad_norm": 0.0, "kl": 0.16500547528266907, "learning_rate": 3.5961901825038416e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6482 }, { "completion_length": 789.0, "epoch": 1.796840354767184, "grad_norm": 0.0, "kl": 1925164.875, "learning_rate": 3.5957967036056387e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6483 }, { "completion_length": 684.5, "epoch": 1.7971175166297118, "grad_norm": 0.0, "kl": 0.19770462810993195, "learning_rate": 3.595403191103909e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6484 }, { "completion_length": 644.5, "epoch": 1.7973946784922394, "grad_norm": 0.0, "kl": 0.2545994222164154, "learning_rate": 3.59500964501072e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6485 }, { "completion_length": 632.75, "epoch": 1.7976718403547673, "grad_norm": 0.0, "kl": 0.2270216941833496, "learning_rate": 3.594616065338141e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6486 }, { "completion_length": 613.0, "epoch": 1.7979490022172948, "grad_norm": 0.0, "kl": 0.20650015771389008, "learning_rate": 3.5942224520982405e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6487 }, { "completion_length": 644.25, "epoch": 1.7982261640798227, "grad_norm": 0.0, "kl": 0.2492886781692505, "learning_rate": 3.5938288053030896e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6488 }, { "completion_length": 581.5, "epoch": 1.7985033259423502, "grad_norm": 0.0, "kl": 0.42920562624931335, "learning_rate": 3.593435124964758e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6489 }, { "completion_length": 558.5, "epoch": 1.798780487804878, "grad_norm": 0.0, "kl": 0.19641628861427307, "learning_rate": 3.5930414110953214e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6490 }, { "completion_length": 715.5, "epoch": 1.7990576496674058, "grad_norm": 0.0, "kl": 0.19222694635391235, "learning_rate": 3.5926476637068506e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6491 }, { "completion_length": 889.5, "epoch": 1.7993348115299335, "grad_norm": 0.0, "kl": 0.17179733514785767, "learning_rate": 3.592253882811422e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6492 }, { "completion_length": 581.0, "epoch": 1.7996119733924612, "grad_norm": 0.0, "kl": 123694.234375, "learning_rate": 3.5918600684211103e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6493 }, { "completion_length": 611.25, "epoch": 1.7998891352549888, "grad_norm": 0.0, "kl": 0.27876192331314087, "learning_rate": 3.5914662205479923e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6494 }, { "completion_length": 707.75, "epoch": 1.8001662971175167, "grad_norm": 0.0, "kl": 0.20000720024108887, "learning_rate": 3.591072339204146e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6495 }, { "completion_length": 618.25, "epoch": 1.8004434589800442, "grad_norm": 0.0, "kl": 0.19255012273788452, "learning_rate": 3.590678424401649e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6496 }, { "completion_length": 650.5, "epoch": 1.8007206208425721, "grad_norm": 0.0, "kl": 0.3159656822681427, "learning_rate": 3.5902844761525817e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6497 }, { "completion_length": 656.25, "epoch": 1.8009977827050998, "grad_norm": 0.0, "kl": 0.2002784162759781, "learning_rate": 3.5898904944690256e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6498 }, { "completion_length": 724.5, "epoch": 1.8012749445676275, "grad_norm": 0.0, "kl": 0.1714971512556076, "learning_rate": 3.589496479363062e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6499 }, { "completion_length": 818.75, "epoch": 1.8015521064301552, "grad_norm": 5.4785003662109375, "kl": 19.4033203125, "learning_rate": 3.589102430846773e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6500 }, { "completion_length": 835.75, "epoch": 1.8018292682926829, "grad_norm": 0.0, "kl": 0.19300729036331177, "learning_rate": 3.588708348932244e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6501 }, { "completion_length": 657.0, "epoch": 1.8021064301552108, "grad_norm": 0.0, "kl": 0.17261956632137299, "learning_rate": 3.588314233631558e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6502 }, { "completion_length": 612.25, "epoch": 1.8023835920177382, "grad_norm": 0.0, "kl": 0.22228239476680756, "learning_rate": 3.587920084956802e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6503 }, { "completion_length": 614.0, "epoch": 1.8026607538802661, "grad_norm": 0.3904837965965271, "kl": 778038.125, "learning_rate": 3.587525902920062e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6504 }, { "completion_length": 572.0, "epoch": 1.8029379157427938, "grad_norm": 0.0, "kl": 0.23645132780075073, "learning_rate": 3.5871316875334272e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6505 }, { "completion_length": 705.75, "epoch": 1.8032150776053215, "grad_norm": 0.0, "kl": 0.24899983406066895, "learning_rate": 3.586737438808986e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6506 }, { "completion_length": 709.0, "epoch": 1.8034922394678492, "grad_norm": 0.544419527053833, "kl": 0.18688087165355682, "learning_rate": 3.586343156758828e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6507 }, { "completion_length": 682.25, "epoch": 1.8037694013303769, "grad_norm": 0.0, "kl": 111967.9375, "learning_rate": 3.585948841395045e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6508 }, { "completion_length": 564.25, "epoch": 1.8040465631929048, "grad_norm": 0.0, "kl": 0.2986615300178528, "learning_rate": 3.5855544927297283e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6509 }, { "completion_length": 760.0, "epoch": 1.8043237250554323, "grad_norm": 0.3728949725627899, "kl": 0.17570209503173828, "learning_rate": 3.58516011077497e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6510 }, { "completion_length": 584.25, "epoch": 1.8046008869179602, "grad_norm": 0.0, "kl": 1283920.375, "learning_rate": 3.584765695542866e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6511 }, { "completion_length": 605.25, "epoch": 1.8048780487804879, "grad_norm": 0.0, "kl": 0.28337642550468445, "learning_rate": 3.5843712470455106e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6512 }, { "completion_length": 719.75, "epoch": 1.8051552106430155, "grad_norm": 1.468687653541565, "kl": 19537.896484375, "learning_rate": 3.5839767652949998e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6513 }, { "completion_length": 739.75, "epoch": 1.8054323725055432, "grad_norm": 0.0, "kl": 0.22197821736335754, "learning_rate": 3.583582250303432e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6514 }, { "completion_length": 679.25, "epoch": 1.805709534368071, "grad_norm": 0.0, "kl": 0.15535444021224976, "learning_rate": 3.5831877020829024e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6515 }, { "completion_length": 616.5, "epoch": 1.8059866962305988, "grad_norm": 0.0, "kl": 0.21653065085411072, "learning_rate": 3.5827931206455135e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6516 }, { "completion_length": 744.5, "epoch": 1.8062638580931263, "grad_norm": 0.0, "kl": 0.15860368311405182, "learning_rate": 3.582398506003363e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6517 }, { "completion_length": 720.5, "epoch": 1.8065410199556542, "grad_norm": 0.0, "kl": 0.17963556945323944, "learning_rate": 3.5820038581685536e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6518 }, { "completion_length": 664.75, "epoch": 1.8068181818181817, "grad_norm": 0.0, "kl": 0.2595370411872864, "learning_rate": 3.5816091771531858e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6519 }, { "completion_length": 585.75, "epoch": 1.8070953436807096, "grad_norm": 0.0, "kl": 0.19844914972782135, "learning_rate": 3.5812144629693647e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6520 }, { "completion_length": 651.25, "epoch": 1.8073725055432373, "grad_norm": 1.5571303367614746, "kl": 0.22903263568878174, "learning_rate": 3.5808197156291936e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6521 }, { "completion_length": 678.5, "epoch": 1.807649667405765, "grad_norm": 0.0, "kl": 0.2798127830028534, "learning_rate": 3.580424935144778e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6522 }, { "completion_length": 605.25, "epoch": 1.8079268292682928, "grad_norm": 0.0, "kl": 0.2566535472869873, "learning_rate": 3.580030121528224e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6523 }, { "completion_length": 667.5, "epoch": 1.8082039911308203, "grad_norm": 0.0, "kl": 0.21160121262073517, "learning_rate": 3.579635274791639e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6524 }, { "completion_length": 585.75, "epoch": 1.8084811529933482, "grad_norm": 0.0, "kl": 0.2088204026222229, "learning_rate": 3.5792403949471312e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6525 }, { "completion_length": 672.75, "epoch": 1.8087583148558757, "grad_norm": 0.36041274666786194, "kl": 6915272.5, "learning_rate": 3.57884548200681e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6526 }, { "completion_length": 632.25, "epoch": 1.8090354767184036, "grad_norm": 0.0, "kl": 0.24062730371952057, "learning_rate": 3.578450535982786e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6527 }, { "completion_length": 707.5, "epoch": 1.8093126385809313, "grad_norm": 0.0, "kl": 2.1420085430145264, "learning_rate": 3.57805555688717e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6528 }, { "completion_length": 670.75, "epoch": 1.809589800443459, "grad_norm": 0.0, "kl": 0.1910804659128189, "learning_rate": 3.577660544732075e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6529 }, { "completion_length": 673.5, "epoch": 1.8098669623059866, "grad_norm": 0.0, "kl": 0.18882229924201965, "learning_rate": 3.5772654995296136e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6530 }, { "completion_length": 651.25, "epoch": 1.8101441241685143, "grad_norm": 0.0, "kl": 0.23730681836605072, "learning_rate": 3.5768704212919013e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6531 }, { "completion_length": 577.75, "epoch": 1.8104212860310422, "grad_norm": 0.0, "kl": 72519912.0, "learning_rate": 3.5764753100310522e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6532 }, { "completion_length": 589.75, "epoch": 1.8106984478935697, "grad_norm": 0.0, "kl": 0.19345904886722565, "learning_rate": 3.5760801657591837e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6533 }, { "completion_length": 725.5, "epoch": 1.8109756097560976, "grad_norm": 0.0, "kl": 0.19116389751434326, "learning_rate": 3.5756849884884124e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6534 }, { "completion_length": 664.0, "epoch": 1.8112527716186253, "grad_norm": 0.0, "kl": 0.24859973788261414, "learning_rate": 3.5752897782308583e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6535 }, { "completion_length": 563.75, "epoch": 1.811529933481153, "grad_norm": 0.0, "kl": 0.229396253824234, "learning_rate": 3.5748945349986393e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6536 }, { "completion_length": 551.0, "epoch": 1.8118070953436807, "grad_norm": 0.0, "kl": 0.18065553903579712, "learning_rate": 3.574499258803876e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6537 }, { "completion_length": 670.0, "epoch": 1.8120842572062084, "grad_norm": 0.0, "kl": 0.24858231842517853, "learning_rate": 3.5741039496586906e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6538 }, { "completion_length": 617.75, "epoch": 1.8123614190687363, "grad_norm": 0.0, "kl": 4.016302108764648, "learning_rate": 3.5737086075752054e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6539 }, { "completion_length": 811.0, "epoch": 1.8126385809312637, "grad_norm": 0.0, "kl": 0.3352229595184326, "learning_rate": 3.5733132325655434e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6540 }, { "completion_length": 610.25, "epoch": 1.8129157427937916, "grad_norm": 0.4789471924304962, "kl": 0.20808231830596924, "learning_rate": 3.5729178246418295e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6541 }, { "completion_length": 640.25, "epoch": 1.8131929046563193, "grad_norm": 0.0, "kl": 0.21672910451889038, "learning_rate": 3.5725223838161888e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6542 }, { "completion_length": 702.0, "epoch": 1.813470066518847, "grad_norm": 0.0, "kl": 0.30091673135757446, "learning_rate": 3.5721269101007487e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6543 }, { "completion_length": 620.5, "epoch": 1.8137472283813747, "grad_norm": 0.4229271411895752, "kl": 107106928.0, "learning_rate": 3.5717314035076355e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6544 }, { "completion_length": 734.75, "epoch": 1.8140243902439024, "grad_norm": 0.0, "kl": 0.22311024367809296, "learning_rate": 3.5713358640489793e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6545 }, { "completion_length": 697.0, "epoch": 1.8143015521064303, "grad_norm": 0.0, "kl": 0.24946917593479156, "learning_rate": 3.5709402917369086e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6546 }, { "completion_length": 642.75, "epoch": 1.8145787139689578, "grad_norm": 0.0, "kl": 0.17289792001247406, "learning_rate": 3.5705446865835537e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6547 }, { "completion_length": 620.25, "epoch": 1.8148558758314857, "grad_norm": 0.0, "kl": 0.23140181601047516, "learning_rate": 3.570149048601047e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6548 }, { "completion_length": 569.75, "epoch": 1.8151330376940134, "grad_norm": 0.0, "kl": 0.17502112686634064, "learning_rate": 3.5697533778015205e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6549 }, { "completion_length": 708.0, "epoch": 1.815410199556541, "grad_norm": 0.0, "kl": 0.1876138150691986, "learning_rate": 3.569357674197108e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6550 }, { "completion_length": 589.5, "epoch": 1.8156873614190687, "grad_norm": 0.0, "kl": 0.2108003944158554, "learning_rate": 3.568961937799944e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6551 }, { "completion_length": 634.5, "epoch": 1.8159645232815964, "grad_norm": 0.0, "kl": 0.22157517075538635, "learning_rate": 3.5685661686221644e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6552 }, { "completion_length": 613.25, "epoch": 1.8162416851441243, "grad_norm": 0.0, "kl": 0.2988227605819702, "learning_rate": 3.5681703666759055e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6553 }, { "completion_length": 632.75, "epoch": 1.8165188470066518, "grad_norm": 0.0, "kl": 0.22186075150966644, "learning_rate": 3.567774531973305e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6554 }, { "completion_length": 703.0, "epoch": 1.8167960088691797, "grad_norm": 0.0, "kl": 980354.25, "learning_rate": 3.5673786645265013e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6555 }, { "completion_length": 628.0, "epoch": 1.8170731707317072, "grad_norm": 0.0, "kl": 0.1955552101135254, "learning_rate": 3.566982764347634e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6556 }, { "completion_length": 711.5, "epoch": 1.817350332594235, "grad_norm": 0.0, "kl": 0.1748417615890503, "learning_rate": 3.5665868314488437e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6557 }, { "completion_length": 611.5, "epoch": 1.8176274944567627, "grad_norm": 0.0, "kl": 0.16648295521736145, "learning_rate": 3.5661908658422727e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6558 }, { "completion_length": 562.0, "epoch": 1.8179046563192904, "grad_norm": 0.0, "kl": 0.19257155060768127, "learning_rate": 3.5657948675400626e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6559 }, { "completion_length": 708.75, "epoch": 1.8181818181818183, "grad_norm": 0.0, "kl": 0.22456516325473785, "learning_rate": 3.5653988365543574e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6560 }, { "completion_length": 632.25, "epoch": 1.8184589800443458, "grad_norm": 0.0, "kl": 0.21331281960010529, "learning_rate": 3.5650027728973023e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6561 }, { "completion_length": 577.25, "epoch": 1.8187361419068737, "grad_norm": 0.0, "kl": 0.19481538236141205, "learning_rate": 3.564606676581043e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6562 }, { "completion_length": 742.75, "epoch": 1.8190133037694012, "grad_norm": 0.0, "kl": 0.37618494033813477, "learning_rate": 3.564210547617724e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6563 }, { "completion_length": 554.0, "epoch": 1.819290465631929, "grad_norm": 0.0, "kl": 0.1972494274377823, "learning_rate": 3.5638143860194953e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6564 }, { "completion_length": 758.0, "epoch": 1.8195676274944568, "grad_norm": 0.0, "kl": 0.20669105648994446, "learning_rate": 3.5634181917985057e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6565 }, { "completion_length": 624.75, "epoch": 1.8198447893569845, "grad_norm": 0.0, "kl": 0.7320138812065125, "learning_rate": 3.563021964966902e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6566 }, { "completion_length": 577.25, "epoch": 1.8201219512195121, "grad_norm": 0.0, "kl": 0.19808073341846466, "learning_rate": 3.5626257055368384e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6567 }, { "completion_length": 569.25, "epoch": 1.8203991130820398, "grad_norm": 0.0, "kl": 0.3932497799396515, "learning_rate": 3.562229413520464e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6568 }, { "completion_length": 628.75, "epoch": 1.8206762749445677, "grad_norm": 0.0, "kl": 1.5017980337142944, "learning_rate": 3.561833088929933e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6569 }, { "completion_length": 719.25, "epoch": 1.8209534368070952, "grad_norm": 0.41436687111854553, "kl": 13477.23828125, "learning_rate": 3.5614367317773974e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6570 }, { "completion_length": 581.5, "epoch": 1.8212305986696231, "grad_norm": 0.0, "kl": 0.1803320050239563, "learning_rate": 3.5610403420750127e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6571 }, { "completion_length": 712.25, "epoch": 1.8215077605321508, "grad_norm": 0.0, "kl": 0.16710089147090912, "learning_rate": 3.560643919834935e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6572 }, { "completion_length": 653.75, "epoch": 1.8217849223946785, "grad_norm": 0.0, "kl": 0.1980120688676834, "learning_rate": 3.5602474650693203e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6573 }, { "completion_length": 638.25, "epoch": 1.8220620842572062, "grad_norm": 0.0, "kl": 0.22095438838005066, "learning_rate": 3.559850977790326e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6574 }, { "completion_length": 649.0, "epoch": 1.8223392461197339, "grad_norm": 0.0, "kl": 0.1919143795967102, "learning_rate": 3.559454458010112e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6575 }, { "completion_length": 670.75, "epoch": 1.8226164079822618, "grad_norm": 0.38575512170791626, "kl": 31147.626953125, "learning_rate": 3.559057905740836e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6576 }, { "completion_length": 634.0, "epoch": 1.8228935698447892, "grad_norm": 0.0, "kl": 334820.6875, "learning_rate": 3.5586613209946596e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6577 }, { "completion_length": 669.5, "epoch": 1.8231707317073171, "grad_norm": 0.0, "kl": 0.16873210668563843, "learning_rate": 3.5582647037837446e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6578 }, { "completion_length": 727.75, "epoch": 1.8234478935698448, "grad_norm": 0.0, "kl": 0.251909464597702, "learning_rate": 3.5578680541202543e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6579 }, { "completion_length": 702.75, "epoch": 1.8237250554323725, "grad_norm": 0.4768192768096924, "kl": 50179.3515625, "learning_rate": 3.55747137201635e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6580 }, { "completion_length": 667.5, "epoch": 1.8240022172949002, "grad_norm": 0.8776999115943909, "kl": 0.6986119151115417, "learning_rate": 3.5570746574841984e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6581 }, { "completion_length": 591.25, "epoch": 1.8242793791574279, "grad_norm": 0.0, "kl": 0.3702356815338135, "learning_rate": 3.556677910535965e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6582 }, { "completion_length": 726.25, "epoch": 1.8245565410199558, "grad_norm": 0.0, "kl": 0.1743023544549942, "learning_rate": 3.5562811311838146e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6583 }, { "completion_length": 653.25, "epoch": 1.8248337028824833, "grad_norm": 0.6618987917900085, "kl": 2.494009017944336, "learning_rate": 3.555884319439917e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6584 }, { "completion_length": 661.25, "epoch": 1.8251108647450112, "grad_norm": 0.0, "kl": 0.3046763837337494, "learning_rate": 3.5554874753164393e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6585 }, { "completion_length": 530.25, "epoch": 1.8253880266075388, "grad_norm": 0.0, "kl": 0.28551867604255676, "learning_rate": 3.5550905988255505e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6586 }, { "completion_length": 649.5, "epoch": 1.8256651884700665, "grad_norm": 0.0, "kl": 0.1635308861732483, "learning_rate": 3.554693689979423e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6587 }, { "completion_length": 620.25, "epoch": 1.8259423503325942, "grad_norm": 0.0, "kl": 0.17069870233535767, "learning_rate": 3.5542967487902276e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6588 }, { "completion_length": 671.25, "epoch": 1.826219512195122, "grad_norm": 0.4000118672847748, "kl": 232444.875, "learning_rate": 3.5538997752701367e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6589 }, { "completion_length": 550.25, "epoch": 1.8264966740576498, "grad_norm": 0.9231567978858948, "kl": 2921.819091796875, "learning_rate": 3.5535027694313233e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6590 }, { "completion_length": 716.25, "epoch": 1.8267738359201773, "grad_norm": 0.0, "kl": 0.15160833299160004, "learning_rate": 3.553105731285963e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6591 }, { "completion_length": 598.0, "epoch": 1.8270509977827052, "grad_norm": 0.0, "kl": 0.23069749772548676, "learning_rate": 3.5527086608462295e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6592 }, { "completion_length": 718.0, "epoch": 1.8273281596452327, "grad_norm": 0.0, "kl": 0.2277626395225525, "learning_rate": 3.5523115581243017e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6593 }, { "completion_length": 575.25, "epoch": 1.8276053215077606, "grad_norm": 0.0, "kl": 0.196044459939003, "learning_rate": 3.551914423132356e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6594 }, { "completion_length": 516.5, "epoch": 1.8278824833702882, "grad_norm": 0.0, "kl": 0.1589183658361435, "learning_rate": 3.55151725588257e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6595 }, { "completion_length": 689.75, "epoch": 1.828159645232816, "grad_norm": 0.0, "kl": 0.1488058865070343, "learning_rate": 3.551120056387124e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6596 }, { "completion_length": 611.25, "epoch": 1.8284368070953438, "grad_norm": 0.0, "kl": 0.2349219024181366, "learning_rate": 3.550722824658199e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6597 }, { "completion_length": 543.0, "epoch": 1.8287139689578713, "grad_norm": 0.0, "kl": 0.19616296887397766, "learning_rate": 3.550325560707976e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6598 }, { "completion_length": 589.75, "epoch": 1.8289911308203992, "grad_norm": 0.0, "kl": 0.1987164467573166, "learning_rate": 3.5499282645486373e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6599 }, { "completion_length": 534.5, "epoch": 1.8292682926829267, "grad_norm": 0.0, "kl": 0.22117020189762115, "learning_rate": 3.549530936192366e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6600 }, { "completion_length": 596.75, "epoch": 1.8295454545454546, "grad_norm": 0.44757604598999023, "kl": 0.21247413754463196, "learning_rate": 3.5491335756513467e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6601 }, { "completion_length": 649.25, "epoch": 1.8298226164079823, "grad_norm": 0.0, "kl": 0.19083265960216522, "learning_rate": 3.5487361829377648e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6602 }, { "completion_length": 656.25, "epoch": 1.83009977827051, "grad_norm": 0.0, "kl": 0.17874117195606232, "learning_rate": 3.548338758063807e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6603 }, { "completion_length": 609.5, "epoch": 1.8303769401330376, "grad_norm": 0.0, "kl": 0.20065176486968994, "learning_rate": 3.5479413010416606e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6604 }, { "completion_length": 680.0, "epoch": 1.8306541019955653, "grad_norm": 0.0, "kl": 0.8458952903747559, "learning_rate": 3.5475438118835144e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6605 }, { "completion_length": 635.0, "epoch": 1.8309312638580932, "grad_norm": 0.0, "kl": 0.18211214244365692, "learning_rate": 3.5471462906015563e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6606 }, { "completion_length": 641.0, "epoch": 1.8312084257206207, "grad_norm": 0.0, "kl": 0.2154626101255417, "learning_rate": 3.5467487372079774e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6607 }, { "completion_length": 549.25, "epoch": 1.8314855875831486, "grad_norm": 0.0, "kl": 0.22948376834392548, "learning_rate": 3.546351151714969e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6608 }, { "completion_length": 661.75, "epoch": 1.8317627494456763, "grad_norm": 0.0, "kl": 5355.7255859375, "learning_rate": 3.5459535341347238e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6609 }, { "completion_length": 660.25, "epoch": 1.832039911308204, "grad_norm": 0.6096233129501343, "kl": 0.1677592247724533, "learning_rate": 3.545555884479435e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6610 }, { "completion_length": 545.5, "epoch": 1.8323170731707317, "grad_norm": 0.0, "kl": 0.17592434585094452, "learning_rate": 3.5451582027612964e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6611 }, { "completion_length": 600.75, "epoch": 1.8325942350332594, "grad_norm": 0.0, "kl": 0.16748961806297302, "learning_rate": 3.5447604889925042e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6612 }, { "completion_length": 738.25, "epoch": 1.8328713968957873, "grad_norm": 0.0, "kl": 0.1890149563550949, "learning_rate": 3.544362743185253e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6613 }, { "completion_length": 505.75, "epoch": 1.8331485587583147, "grad_norm": 0.4043618440628052, "kl": 59234.2734375, "learning_rate": 3.5439649653517416e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6614 }, { "completion_length": 537.25, "epoch": 1.8334257206208426, "grad_norm": 0.0, "kl": 0.1890200823545456, "learning_rate": 3.543567155504167e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6615 }, { "completion_length": 563.25, "epoch": 1.8337028824833703, "grad_norm": 0.0, "kl": 0.15880022943019867, "learning_rate": 3.5431693136547283e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6616 }, { "completion_length": 559.75, "epoch": 1.833980044345898, "grad_norm": 0.0, "kl": 0.22205714881420135, "learning_rate": 3.5427714398156267e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6617 }, { "completion_length": 598.5, "epoch": 1.8342572062084257, "grad_norm": 0.0, "kl": 0.23183056712150574, "learning_rate": 3.5423735339990633e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6618 }, { "completion_length": 583.75, "epoch": 1.8345343680709534, "grad_norm": 1.8233344554901123, "kl": 0.7372397184371948, "learning_rate": 3.54197559621724e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6619 }, { "completion_length": 691.0, "epoch": 1.8348115299334813, "grad_norm": 0.0, "kl": 0.3576514422893524, "learning_rate": 3.5415776264823586e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6620 }, { "completion_length": 621.25, "epoch": 1.8350886917960088, "grad_norm": 0.0, "kl": 4543.0673828125, "learning_rate": 3.541179624806625e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6621 }, { "completion_length": 537.75, "epoch": 1.8353658536585367, "grad_norm": 0.0, "kl": 0.22998341917991638, "learning_rate": 3.540781591202244e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6622 }, { "completion_length": 668.0, "epoch": 1.8356430155210643, "grad_norm": 0.0, "kl": 0.25638547539711, "learning_rate": 3.5403835256814197e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6623 }, { "completion_length": 557.5, "epoch": 1.835920177383592, "grad_norm": 0.0, "kl": 0.9064992070198059, "learning_rate": 3.539985428256362e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6624 }, { "completion_length": 583.0, "epoch": 1.8361973392461197, "grad_norm": 0.0, "kl": 0.19429953396320343, "learning_rate": 3.539587298939276e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6625 }, { "completion_length": 520.25, "epoch": 1.8364745011086474, "grad_norm": 0.0, "kl": 0.4045923352241516, "learning_rate": 3.5391891377423734e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6626 }, { "completion_length": 597.75, "epoch": 1.8367516629711753, "grad_norm": 0.0, "kl": 0.1876770257949829, "learning_rate": 3.5387909446778623e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6627 }, { "completion_length": 535.0, "epoch": 1.8370288248337028, "grad_norm": 0.0, "kl": 0.3085348308086395, "learning_rate": 3.5383927197579537e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6628 }, { "completion_length": 600.75, "epoch": 1.8373059866962307, "grad_norm": 0.0, "kl": 0.24235029518604279, "learning_rate": 3.537994462994861e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6629 }, { "completion_length": 552.25, "epoch": 1.8375831485587582, "grad_norm": 0.0, "kl": 0.22340244054794312, "learning_rate": 3.5375961744007954e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6630 }, { "completion_length": 584.0, "epoch": 1.837860310421286, "grad_norm": 0.0, "kl": 0.5196858644485474, "learning_rate": 3.5371978539879715e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6631 }, { "completion_length": 584.5, "epoch": 1.8381374722838137, "grad_norm": 0.0, "kl": 0.4224897027015686, "learning_rate": 3.536799501768604e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6632 }, { "completion_length": 570.5, "epoch": 1.8384146341463414, "grad_norm": 0.0, "kl": 10524.28515625, "learning_rate": 3.5364011177549095e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6633 }, { "completion_length": 619.25, "epoch": 1.8386917960088693, "grad_norm": 0.0, "kl": 0.18644562363624573, "learning_rate": 3.536002701959103e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6634 }, { "completion_length": 627.0, "epoch": 1.8389689578713968, "grad_norm": 0.0, "kl": 1.0108160972595215, "learning_rate": 3.5356042543934045e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6635 }, { "completion_length": 636.0, "epoch": 1.8392461197339247, "grad_norm": 0.0, "kl": 0.23508453369140625, "learning_rate": 3.53520577507003e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6636 }, { "completion_length": 627.0, "epoch": 1.8395232815964522, "grad_norm": 0.0, "kl": 0.18386249244213104, "learning_rate": 3.534807264001202e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6637 }, { "completion_length": 640.5, "epoch": 1.83980044345898, "grad_norm": 0.0, "kl": 0.20703189074993134, "learning_rate": 3.534408721199139e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6638 }, { "completion_length": 556.0, "epoch": 1.8400776053215078, "grad_norm": 0.0, "kl": 2.1357383728027344, "learning_rate": 3.5340101466760646e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6639 }, { "completion_length": 582.0, "epoch": 1.8403547671840355, "grad_norm": 0.0, "kl": 0.18554045259952545, "learning_rate": 3.5336115404441996e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6640 }, { "completion_length": 637.25, "epoch": 1.8406319290465631, "grad_norm": 0.0, "kl": 0.17977961897850037, "learning_rate": 3.5332129025157684e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6641 }, { "completion_length": 540.5, "epoch": 1.8409090909090908, "grad_norm": 0.638360857963562, "kl": 468416.28125, "learning_rate": 3.532814232902996e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6642 }, { "completion_length": 681.0, "epoch": 1.8411862527716187, "grad_norm": 0.0, "kl": 0.20676107704639435, "learning_rate": 3.532415531618107e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6643 }, { "completion_length": 639.0, "epoch": 1.8414634146341462, "grad_norm": 0.0, "kl": 0.1939307153224945, "learning_rate": 3.532016798673329e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6644 }, { "completion_length": 625.25, "epoch": 1.841740576496674, "grad_norm": 0.0, "kl": 0.2863447368144989, "learning_rate": 3.531618034080888e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6645 }, { "completion_length": 634.75, "epoch": 1.8420177383592018, "grad_norm": 0.0, "kl": 0.24647261202335358, "learning_rate": 3.5312192378530134e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6646 }, { "completion_length": 491.5, "epoch": 1.8422949002217295, "grad_norm": 0.0, "kl": 0.18972261250019073, "learning_rate": 3.530820410001935e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6647 }, { "completion_length": 633.75, "epoch": 1.8425720620842572, "grad_norm": 0.0, "kl": 0.21270331740379333, "learning_rate": 3.5304215505398824e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6648 }, { "completion_length": 625.25, "epoch": 1.8428492239467849, "grad_norm": 0.0, "kl": 0.9338515996932983, "learning_rate": 3.5300226594790876e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6649 }, { "completion_length": 646.75, "epoch": 1.8431263858093128, "grad_norm": 0.39208969473838806, "kl": 162357.125, "learning_rate": 3.529623736831783e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6650 }, { "completion_length": 756.75, "epoch": 1.8434035476718402, "grad_norm": 0.0, "kl": 0.18324850499629974, "learning_rate": 3.5292247826102003e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6651 }, { "completion_length": 503.5, "epoch": 1.8436807095343681, "grad_norm": 0.0, "kl": 0.9655640721321106, "learning_rate": 3.5288257968265766e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6652 }, { "completion_length": 603.75, "epoch": 1.8439578713968958, "grad_norm": 0.0, "kl": 0.20017066597938538, "learning_rate": 3.5284267794931437e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6653 }, { "completion_length": 615.75, "epoch": 1.8442350332594235, "grad_norm": 0.0, "kl": 0.28314173221588135, "learning_rate": 3.528027730622141e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6654 }, { "completion_length": 604.0, "epoch": 1.8445121951219512, "grad_norm": 0.0, "kl": 2.955944538116455, "learning_rate": 3.527628650225804e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6655 }, { "completion_length": 717.5, "epoch": 1.8447893569844789, "grad_norm": 0.44406673312187195, "kl": 0.15618611872196198, "learning_rate": 3.527229538316371e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6656 }, { "completion_length": 568.25, "epoch": 1.8450665188470068, "grad_norm": 0.41853460669517517, "kl": 6965627.0, "learning_rate": 3.526830394906081e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6657 }, { "completion_length": 719.25, "epoch": 1.8453436807095343, "grad_norm": 0.0, "kl": 0.17918150126934052, "learning_rate": 3.5264312200071744e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6658 }, { "completion_length": 523.0, "epoch": 1.8456208425720622, "grad_norm": 0.0, "kl": 0.2712196111679077, "learning_rate": 3.5260320136318927e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6659 }, { "completion_length": 657.5, "epoch": 1.8458980044345898, "grad_norm": 0.0, "kl": 7139320.5, "learning_rate": 3.525632775792477e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6660 }, { "completion_length": 628.0, "epoch": 1.8461751662971175, "grad_norm": 0.0, "kl": 0.2249557077884674, "learning_rate": 3.52523350650117e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6661 }, { "completion_length": 601.5, "epoch": 1.8464523281596452, "grad_norm": 0.0, "kl": 0.23952774703502655, "learning_rate": 3.5248342057702164e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6662 }, { "completion_length": 493.75, "epoch": 1.846729490022173, "grad_norm": 0.0, "kl": 0.3197472095489502, "learning_rate": 3.5244348736118617e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6663 }, { "completion_length": 442.75, "epoch": 1.8470066518847008, "grad_norm": 0.0, "kl": 0.21040362119674683, "learning_rate": 3.52403551003835e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6664 }, { "completion_length": 566.0, "epoch": 1.8472838137472283, "grad_norm": 0.3708150088787079, "kl": 0.2019750028848648, "learning_rate": 3.5236361150619302e-06, "loss": 0.0, "reward": 1.5, "reward_std": 0.5, "rewards/confident_score_func": -0.25, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6665 }, { "completion_length": 581.25, "epoch": 1.8475609756097562, "grad_norm": 0.0, "kl": 0.23575738072395325, "learning_rate": 3.523236688694848e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6666 }, { "completion_length": 587.0, "epoch": 1.8478381374722836, "grad_norm": 0.0, "kl": 0.3223797380924225, "learning_rate": 3.5228372309493534e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6667 }, { "completion_length": 567.75, "epoch": 1.8481152993348116, "grad_norm": 0.0, "kl": 0.1886196881532669, "learning_rate": 3.522437741837695e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6668 }, { "completion_length": 536.0, "epoch": 1.8483924611973392, "grad_norm": 0.6655518412590027, "kl": 255827.8125, "learning_rate": 3.522038221372126e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6669 }, { "completion_length": 578.0, "epoch": 1.848669623059867, "grad_norm": 0.0, "kl": 0.2427709996700287, "learning_rate": 3.521638669564895e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6670 }, { "completion_length": 720.0, "epoch": 1.8489467849223948, "grad_norm": 0.0, "kl": 0.1540307253599167, "learning_rate": 3.521239086428257e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6671 }, { "completion_length": 592.0, "epoch": 1.8492239467849223, "grad_norm": 0.0, "kl": 0.2158331573009491, "learning_rate": 3.520839471974464e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6672 }, { "completion_length": 632.25, "epoch": 1.8495011086474502, "grad_norm": 0.0, "kl": 0.14812245965003967, "learning_rate": 3.5204398262157707e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6673 }, { "completion_length": 544.0, "epoch": 1.8497782705099777, "grad_norm": 0.0, "kl": 0.24939510226249695, "learning_rate": 3.5200401491644333e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6674 }, { "completion_length": 593.75, "epoch": 1.8500554323725056, "grad_norm": 0.0, "kl": 0.16446419060230255, "learning_rate": 3.5196404408327074e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6675 }, { "completion_length": 601.25, "epoch": 1.8503325942350333, "grad_norm": 0.0, "kl": 0.4097975790500641, "learning_rate": 3.5192407012328505e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6676 }, { "completion_length": 562.75, "epoch": 1.850609756097561, "grad_norm": 0.0, "kl": 0.22681395709514618, "learning_rate": 3.5188409303771216e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6677 }, { "completion_length": 525.75, "epoch": 1.8508869179600886, "grad_norm": 0.5162711143493652, "kl": 77672280.0, "learning_rate": 3.5184411282777795e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6678 }, { "completion_length": 538.25, "epoch": 1.8511640798226163, "grad_norm": 0.0, "kl": 0.20423206686973572, "learning_rate": 3.518041294947085e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6679 }, { "completion_length": 626.25, "epoch": 1.8514412416851442, "grad_norm": 0.0, "kl": 0.251082181930542, "learning_rate": 3.517641430397299e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6680 }, { "completion_length": 533.75, "epoch": 1.8517184035476717, "grad_norm": 0.0, "kl": 0.24255815148353577, "learning_rate": 3.517241534640683e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6681 }, { "completion_length": 512.5, "epoch": 1.8519955654101996, "grad_norm": 0.0, "kl": 0.19059127569198608, "learning_rate": 3.516841607689501e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6682 }, { "completion_length": 567.75, "epoch": 1.8522727272727273, "grad_norm": 0.7964573502540588, "kl": 4945787.5, "learning_rate": 3.5164416495560167e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6683 }, { "completion_length": 570.25, "epoch": 1.852549889135255, "grad_norm": 0.0, "kl": 0.22938454151153564, "learning_rate": 3.5160416602524957e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6684 }, { "completion_length": 667.5, "epoch": 1.8528270509977827, "grad_norm": 0.0, "kl": 0.18997898697853088, "learning_rate": 3.515641639791203e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6685 }, { "completion_length": 603.0, "epoch": 1.8531042128603104, "grad_norm": 0.0, "kl": 0.22123387455940247, "learning_rate": 3.515241588184407e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6686 }, { "completion_length": 554.5, "epoch": 1.8533813747228383, "grad_norm": 0.0, "kl": 0.25180092453956604, "learning_rate": 3.514841505444374e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6687 }, { "completion_length": 555.0, "epoch": 1.8536585365853657, "grad_norm": 0.5006421804428101, "kl": 76147336.0, "learning_rate": 3.5144413915833745e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6688 }, { "completion_length": 565.25, "epoch": 1.8539356984478936, "grad_norm": 0.0, "kl": 0.18039721250534058, "learning_rate": 3.514041246613677e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6689 }, { "completion_length": 582.0, "epoch": 1.8542128603104213, "grad_norm": 0.0, "kl": 360.5760498046875, "learning_rate": 3.513641070547553e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6690 }, { "completion_length": 563.5, "epoch": 1.854490022172949, "grad_norm": 0.0, "kl": 0.18866094946861267, "learning_rate": 3.513240863397273e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6691 }, { "completion_length": 529.0, "epoch": 1.8547671840354767, "grad_norm": 0.0, "kl": 0.219736710190773, "learning_rate": 3.5128406251751117e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6692 }, { "completion_length": 524.75, "epoch": 1.8550443458980044, "grad_norm": 0.0, "kl": 0.6151033639907837, "learning_rate": 3.512440355893342e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6693 }, { "completion_length": 641.25, "epoch": 1.8553215077605323, "grad_norm": 1.8586992025375366, "kl": 15056419.0, "learning_rate": 3.512040055564237e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6694 }, { "completion_length": 498.0, "epoch": 1.8555986696230597, "grad_norm": 0.0, "kl": 0.30248066782951355, "learning_rate": 3.5116397242000748e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6695 }, { "completion_length": 654.0, "epoch": 1.8558758314855877, "grad_norm": 0.0, "kl": 0.21419154107570648, "learning_rate": 3.5112393618131295e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6696 }, { "completion_length": 550.0, "epoch": 1.8561529933481153, "grad_norm": 0.0, "kl": 0.20650982856750488, "learning_rate": 3.510838968415681e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6697 }, { "completion_length": 522.75, "epoch": 1.856430155210643, "grad_norm": 0.0, "kl": 0.23122230172157288, "learning_rate": 3.5104385440200043e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6698 }, { "completion_length": 537.0, "epoch": 1.8567073170731707, "grad_norm": 0.0, "kl": 53272576.0, "learning_rate": 3.5100380886383824e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6699 }, { "completion_length": 488.25, "epoch": 1.8569844789356984, "grad_norm": 0.0, "kl": 0.1613663136959076, "learning_rate": 3.5096376022830937e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6700 }, { "completion_length": 566.25, "epoch": 1.8572616407982263, "grad_norm": 0.0, "kl": 0.894834041595459, "learning_rate": 3.5092370849664198e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6701 }, { "completion_length": 516.5, "epoch": 1.8575388026607538, "grad_norm": 0.0, "kl": 0.26284050941467285, "learning_rate": 3.508836536700642e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6702 }, { "completion_length": 592.5, "epoch": 1.8578159645232817, "grad_norm": 0.0, "kl": 0.19934971630573273, "learning_rate": 3.5084359574980452e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6703 }, { "completion_length": 574.0, "epoch": 1.8580931263858091, "grad_norm": 0.44958069920539856, "kl": 83243784.0, "learning_rate": 3.508035347370912e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6704 }, { "completion_length": 563.0, "epoch": 1.858370288248337, "grad_norm": 0.0, "kl": 0.2257297933101654, "learning_rate": 3.5076347063315287e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6705 }, { "completion_length": 573.75, "epoch": 1.8586474501108647, "grad_norm": 0.0, "kl": 0.7287028431892395, "learning_rate": 3.5072340343921797e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6706 }, { "completion_length": 526.5, "epoch": 1.8589246119733924, "grad_norm": 0.0, "kl": 0.19074872136116028, "learning_rate": 3.506833331565154e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6707 }, { "completion_length": 518.0, "epoch": 1.8592017738359203, "grad_norm": 0.0, "kl": 0.20813794434070587, "learning_rate": 3.5064325978627365e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6708 }, { "completion_length": 565.0, "epoch": 1.8594789356984478, "grad_norm": 0.0, "kl": 0.21344220638275146, "learning_rate": 3.5060318332972193e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6709 }, { "completion_length": 458.75, "epoch": 1.8597560975609757, "grad_norm": 0.0, "kl": 0.2396698296070099, "learning_rate": 3.505631037880891e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6710 }, { "completion_length": 545.25, "epoch": 1.8600332594235032, "grad_norm": 0.0, "kl": 0.16631540656089783, "learning_rate": 3.505230211626041e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6711 }, { "completion_length": 518.25, "epoch": 1.860310421286031, "grad_norm": 0.0, "kl": 315300832.0, "learning_rate": 3.504829354544963e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6712 }, { "completion_length": 637.5, "epoch": 1.8605875831485588, "grad_norm": 0.0, "kl": 0.2350558191537857, "learning_rate": 3.504428466649948e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6713 }, { "completion_length": 564.25, "epoch": 1.8608647450110865, "grad_norm": 0.0, "kl": 0.1814207285642624, "learning_rate": 3.504027547953291e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6714 }, { "completion_length": 671.75, "epoch": 1.8611419068736141, "grad_norm": 0.3695211410522461, "kl": 18963488.0, "learning_rate": 3.503626598467285e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6715 }, { "completion_length": 530.5, "epoch": 1.8614190687361418, "grad_norm": 0.0, "kl": 0.2437182366847992, "learning_rate": 3.503225618204227e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6716 }, { "completion_length": 500.75, "epoch": 1.8616962305986697, "grad_norm": 0.0, "kl": 0.28446418046951294, "learning_rate": 3.5028246071764117e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6717 }, { "completion_length": 512.0, "epoch": 1.8619733924611972, "grad_norm": 0.0, "kl": 0.6994664072990417, "learning_rate": 3.502423565396138e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6718 }, { "completion_length": 564.0, "epoch": 1.862250554323725, "grad_norm": 0.0, "kl": 0.5505864024162292, "learning_rate": 3.5020224928757037e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6719 }, { "completion_length": 561.5, "epoch": 1.8625277161862528, "grad_norm": 0.0, "kl": 0.18822011351585388, "learning_rate": 3.501621389627407e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6720 }, { "completion_length": 570.0, "epoch": 1.8628048780487805, "grad_norm": 0.4686879515647888, "kl": 5399230464.0, "learning_rate": 3.501220255663549e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6721 }, { "completion_length": 494.75, "epoch": 1.8630820399113082, "grad_norm": 0.0, "kl": 0.26010048389434814, "learning_rate": 3.5008190909964318e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6722 }, { "completion_length": 503.75, "epoch": 1.8633592017738358, "grad_norm": 0.4356233775615692, "kl": 13104285696.0, "learning_rate": 3.5004178956383556e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6723 }, { "completion_length": 562.25, "epoch": 1.8636363636363638, "grad_norm": 0.0, "kl": 0.2326626479625702, "learning_rate": 3.500016669601625e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6724 }, { "completion_length": 492.75, "epoch": 1.8639135254988912, "grad_norm": 0.0, "kl": 927128768.0, "learning_rate": 3.4996154128985414e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6725 }, { "completion_length": 571.0, "epoch": 1.8641906873614191, "grad_norm": 0.0, "kl": 0.2325304001569748, "learning_rate": 3.4992141255414126e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6726 }, { "completion_length": 531.25, "epoch": 1.8644678492239468, "grad_norm": 0.0, "kl": 0.20198236405849457, "learning_rate": 3.498812807542543e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6727 }, { "completion_length": 635.75, "epoch": 1.8647450110864745, "grad_norm": 0.0, "kl": 0.204505056142807, "learning_rate": 3.4984114589142388e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6728 }, { "completion_length": 539.5, "epoch": 1.8650221729490022, "grad_norm": 0.0, "kl": 0.20483340322971344, "learning_rate": 3.49801007966881e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6729 }, { "completion_length": 558.25, "epoch": 1.8652993348115299, "grad_norm": 0.42808887362480164, "kl": 18010505216.0, "learning_rate": 3.4976086698185622e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6730 }, { "completion_length": 526.5, "epoch": 1.8655764966740578, "grad_norm": 0.0, "kl": 0.17617304623126984, "learning_rate": 3.4972072293758074e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6731 }, { "completion_length": 529.25, "epoch": 1.8658536585365852, "grad_norm": 0.0, "kl": 153675169792.0, "learning_rate": 3.496805758352855e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6732 }, { "completion_length": 557.5, "epoch": 1.8661308203991132, "grad_norm": 0.0, "kl": 0.23386405408382416, "learning_rate": 3.4964042567620165e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6733 }, { "completion_length": 514.25, "epoch": 1.8664079822616408, "grad_norm": 0.0, "kl": 0.25190627574920654, "learning_rate": 3.4960027246156043e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6734 }, { "completion_length": 587.75, "epoch": 1.8666851441241685, "grad_norm": 0.5216541290283203, "kl": 764644160.0, "learning_rate": 3.495601161925932e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6735 }, { "completion_length": 482.5, "epoch": 1.8669623059866962, "grad_norm": 0.0, "kl": 0.22787189483642578, "learning_rate": 3.495199568705313e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6736 }, { "completion_length": 548.0, "epoch": 1.867239467849224, "grad_norm": 0.0, "kl": 0.15736685693264008, "learning_rate": 3.494797944966064e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6737 }, { "completion_length": 580.5, "epoch": 1.8675166297117518, "grad_norm": 0.0, "kl": 0.20099689066410065, "learning_rate": 3.4943962907204996e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6738 }, { "completion_length": 572.25, "epoch": 1.8677937915742793, "grad_norm": 0.0, "kl": 0.193587988615036, "learning_rate": 3.493994605980938e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6739 }, { "completion_length": 517.0, "epoch": 1.8680709534368072, "grad_norm": 0.0, "kl": 0.206930011510849, "learning_rate": 3.493592890759697e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6740 }, { "completion_length": 505.25, "epoch": 1.8683481152993349, "grad_norm": 0.0, "kl": 37885628416.0, "learning_rate": 3.493191145069094e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6741 }, { "completion_length": 568.25, "epoch": 1.8686252771618626, "grad_norm": 0.0, "kl": 0.235227569937706, "learning_rate": 3.492789368921451e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6742 }, { "completion_length": 504.0, "epoch": 1.8689024390243902, "grad_norm": 0.0, "kl": 0.2971622049808502, "learning_rate": 3.4923875623290886e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6743 }, { "completion_length": 562.5, "epoch": 1.869179600886918, "grad_norm": 0.476200133562088, "kl": 695542939648.0, "learning_rate": 3.491985725304327e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6744 }, { "completion_length": 490.75, "epoch": 1.8694567627494458, "grad_norm": 0.0, "kl": 0.2527649700641632, "learning_rate": 3.49158385785949e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6745 }, { "completion_length": 486.0, "epoch": 1.8697339246119733, "grad_norm": 0.0, "kl": 0.25492051243782043, "learning_rate": 3.4911819600069015e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6746 }, { "completion_length": 498.0, "epoch": 1.8700110864745012, "grad_norm": 0.0, "kl": 0.22298277914524078, "learning_rate": 3.4907800317588845e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6747 }, { "completion_length": 536.25, "epoch": 1.8702882483370287, "grad_norm": 0.0, "kl": 0.22749902307987213, "learning_rate": 3.4903780731277665e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6748 }, { "completion_length": 566.25, "epoch": 1.8705654101995566, "grad_norm": 0.0, "kl": 0.298122763633728, "learning_rate": 3.489976084125872e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6749 }, { "completion_length": 633.25, "epoch": 1.8708425720620843, "grad_norm": 0.0, "kl": 0.20824061334133148, "learning_rate": 3.4895740647655297e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6750 }, { "completion_length": 701.5, "epoch": 1.871119733924612, "grad_norm": 0.0, "kl": 0.1556420773267746, "learning_rate": 3.489172015059067e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6751 }, { "completion_length": 554.25, "epoch": 1.8713968957871396, "grad_norm": 0.0, "kl": 5.640300273895264, "learning_rate": 3.488769935018814e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6752 }, { "completion_length": 494.25, "epoch": 1.8716740576496673, "grad_norm": 0.0, "kl": 0.18141186237335205, "learning_rate": 3.4883678246570995e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6753 }, { "completion_length": 690.0, "epoch": 1.8719512195121952, "grad_norm": 0.0, "kl": 0.1863013356924057, "learning_rate": 3.4879656839862564e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6754 }, { "completion_length": 553.75, "epoch": 1.8722283813747227, "grad_norm": 0.0, "kl": 0.22773385047912598, "learning_rate": 3.4875635130186148e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6755 }, { "completion_length": 635.0, "epoch": 1.8725055432372506, "grad_norm": 0.0, "kl": 0.2669469714164734, "learning_rate": 3.4871613117665087e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6756 }, { "completion_length": 609.75, "epoch": 1.8727827050997783, "grad_norm": 0.0, "kl": 0.18245166540145874, "learning_rate": 3.4867590802422713e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6757 }, { "completion_length": 529.25, "epoch": 1.873059866962306, "grad_norm": 0.0, "kl": 0.21884234249591827, "learning_rate": 3.4863568184582385e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6758 }, { "completion_length": 459.25, "epoch": 1.8733370288248337, "grad_norm": 0.0, "kl": 0.22151929140090942, "learning_rate": 3.4859545264267443e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6759 }, { "completion_length": 533.0, "epoch": 1.8736141906873613, "grad_norm": 0.0, "kl": 0.22054627537727356, "learning_rate": 3.4855522041601265e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6760 }, { "completion_length": 486.25, "epoch": 1.8738913525498893, "grad_norm": 0.0, "kl": 0.2550632357597351, "learning_rate": 3.4851498516707226e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6761 }, { "completion_length": 568.25, "epoch": 1.8741685144124167, "grad_norm": 0.0, "kl": 0.20321889221668243, "learning_rate": 3.484747468970871e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6762 }, { "completion_length": 655.0, "epoch": 1.8744456762749446, "grad_norm": 0.0, "kl": 0.21015194058418274, "learning_rate": 3.4843450560729107e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6763 }, { "completion_length": 540.0, "epoch": 1.8747228381374723, "grad_norm": 0.0, "kl": 0.20154644548892975, "learning_rate": 3.483942612989183e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6764 }, { "completion_length": 555.5, "epoch": 1.875, "grad_norm": 0.0, "kl": 10.615647315979004, "learning_rate": 3.4835401397320273e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6765 }, { "completion_length": 573.0, "epoch": 1.8752771618625277, "grad_norm": 0.4693404734134674, "kl": 0.19137336313724518, "learning_rate": 3.483137636313787e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6766 }, { "completion_length": 537.0, "epoch": 1.8755543237250554, "grad_norm": 0.0, "kl": 0.2103600800037384, "learning_rate": 3.482735102746806e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6767 }, { "completion_length": 481.0, "epoch": 1.8758314855875833, "grad_norm": 0.0, "kl": 0.2505750358104706, "learning_rate": 3.482332539043427e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6768 }, { "completion_length": 467.0, "epoch": 1.8761086474501107, "grad_norm": 0.0, "kl": 0.20347252488136292, "learning_rate": 3.481929945215996e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6769 }, { "completion_length": 580.25, "epoch": 1.8763858093126387, "grad_norm": 0.0, "kl": 0.19134840369224548, "learning_rate": 3.481527321276858e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6770 }, { "completion_length": 685.0, "epoch": 1.8766629711751663, "grad_norm": 0.33697572350502014, "kl": 2163351355392.0, "learning_rate": 3.4811246672383604e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6771 }, { "completion_length": 485.5, "epoch": 1.876940133037694, "grad_norm": 0.0, "kl": 0.25788557529449463, "learning_rate": 3.480721983112849e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6772 }, { "completion_length": 600.0, "epoch": 1.8772172949002217, "grad_norm": 0.0, "kl": 0.22694605588912964, "learning_rate": 3.480319268912676e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6773 }, { "completion_length": 557.0, "epoch": 1.8774944567627494, "grad_norm": 0.0, "kl": 2.3980724811553955, "learning_rate": 3.479916524650188e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6774 }, { "completion_length": 449.25, "epoch": 1.8777716186252773, "grad_norm": 0.0, "kl": 0.24366232752799988, "learning_rate": 3.479513750337737e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6775 }, { "completion_length": 515.0, "epoch": 1.8780487804878048, "grad_norm": 0.0, "kl": 0.18320332467556, "learning_rate": 3.479110945987674e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6776 }, { "completion_length": 474.75, "epoch": 1.8783259423503327, "grad_norm": 0.0, "kl": 0.32055890560150146, "learning_rate": 3.4787081116123507e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6777 }, { "completion_length": 529.25, "epoch": 1.8786031042128604, "grad_norm": 0.0, "kl": 0.2173995077610016, "learning_rate": 3.478305247224122e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6778 }, { "completion_length": 601.0, "epoch": 1.878880266075388, "grad_norm": 0.0, "kl": 0.1895032674074173, "learning_rate": 3.47790235283534e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6779 }, { "completion_length": 519.0, "epoch": 1.8791574279379157, "grad_norm": 0.5956296324729919, "kl": 326411.78125, "learning_rate": 3.4774994284583607e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6780 }, { "completion_length": 526.75, "epoch": 1.8794345898004434, "grad_norm": 0.0, "kl": 0.2347564995288849, "learning_rate": 3.4770964741055406e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6781 }, { "completion_length": 544.75, "epoch": 1.8797117516629713, "grad_norm": 0.0, "kl": 0.3848528265953064, "learning_rate": 3.476693489789237e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6782 }, { "completion_length": 505.5, "epoch": 1.8799889135254988, "grad_norm": 0.0, "kl": 0.1703839898109436, "learning_rate": 3.476290475521806e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6783 }, { "completion_length": 657.75, "epoch": 1.8802660753880267, "grad_norm": 0.0, "kl": 35970.5625, "learning_rate": 3.475887431315608e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6784 }, { "completion_length": 527.75, "epoch": 1.8805432372505542, "grad_norm": 0.0, "kl": 0.16285130381584167, "learning_rate": 3.4754843571830022e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6785 }, { "completion_length": 509.75, "epoch": 1.880820399113082, "grad_norm": 0.0, "kl": 0.21177656948566437, "learning_rate": 3.4750812531363486e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6786 }, { "completion_length": 665.75, "epoch": 1.8810975609756098, "grad_norm": 0.0, "kl": 0.15702128410339355, "learning_rate": 3.4746781191880086e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6787 }, { "completion_length": 499.0, "epoch": 1.8813747228381374, "grad_norm": 0.5362765192985535, "kl": 1270821289984.0, "learning_rate": 3.474274955350346e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6788 }, { "completion_length": 551.0, "epoch": 1.8816518847006651, "grad_norm": 0.0, "kl": 6468571496448.0, "learning_rate": 3.473871761635724e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6789 }, { "completion_length": 522.75, "epoch": 1.8819290465631928, "grad_norm": 0.0, "kl": 0.23255541920661926, "learning_rate": 3.473468538056506e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6790 }, { "completion_length": 544.0, "epoch": 1.8822062084257207, "grad_norm": 0.0, "kl": 0.19592644274234772, "learning_rate": 3.4730652846250566e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6791 }, { "completion_length": 570.75, "epoch": 1.8824833702882482, "grad_norm": 0.0, "kl": 0.16506017744541168, "learning_rate": 3.4726620013537425e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6792 }, { "completion_length": 494.5, "epoch": 1.882760532150776, "grad_norm": 0.0, "kl": 0.23010008037090302, "learning_rate": 3.4722586882549326e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6793 }, { "completion_length": 573.25, "epoch": 1.8830376940133038, "grad_norm": 0.0, "kl": 81348788224.0, "learning_rate": 3.471855345340992e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6794 }, { "completion_length": 550.25, "epoch": 1.8833148558758315, "grad_norm": 0.0, "kl": 0.3003807067871094, "learning_rate": 3.4714519726242912e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6795 }, { "completion_length": 591.25, "epoch": 1.8835920177383592, "grad_norm": 0.3482559621334076, "kl": 0.22002491354942322, "learning_rate": 3.4710485701171994e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6796 }, { "completion_length": 604.25, "epoch": 1.8838691796008868, "grad_norm": 0.0, "kl": 0.2234349399805069, "learning_rate": 3.4706451378320875e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6797 }, { "completion_length": 562.25, "epoch": 1.8841463414634148, "grad_norm": 0.0, "kl": 0.2340071201324463, "learning_rate": 3.4702416757813272e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6798 }, { "completion_length": 504.0, "epoch": 1.8844235033259422, "grad_norm": 0.0, "kl": 0.30069267749786377, "learning_rate": 3.469838183977291e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6799 }, { "completion_length": 530.75, "epoch": 1.8847006651884701, "grad_norm": 0.0, "kl": 0.17207324504852295, "learning_rate": 3.469434662432352e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6800 }, { "completion_length": 473.75, "epoch": 1.8849778270509978, "grad_norm": 0.0, "kl": 0.39038410782814026, "learning_rate": 3.4690311111588844e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6801 }, { "completion_length": 501.25, "epoch": 1.8852549889135255, "grad_norm": 0.0, "kl": 0.21504953503608704, "learning_rate": 3.468627530169263e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6802 }, { "completion_length": 522.0, "epoch": 1.8855321507760532, "grad_norm": 0.41202157735824585, "kl": 27985442766848.0, "learning_rate": 3.4682239194758654e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6803 }, { "completion_length": 458.0, "epoch": 1.8858093126385809, "grad_norm": 0.0, "kl": 0.20864471793174744, "learning_rate": 3.4678202790910675e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6804 }, { "completion_length": 537.25, "epoch": 1.8860864745011088, "grad_norm": 0.0, "kl": 0.18348576128482819, "learning_rate": 3.4674166090272487e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6805 }, { "completion_length": 484.0, "epoch": 1.8863636363636362, "grad_norm": 0.0, "kl": 0.258492648601532, "learning_rate": 3.4670129092967865e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6806 }, { "completion_length": 500.5, "epoch": 1.8866407982261642, "grad_norm": 0.0, "kl": 0.21790055930614471, "learning_rate": 3.46660917991206e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6807 }, { "completion_length": 502.75, "epoch": 1.8869179600886918, "grad_norm": 0.0, "kl": 0.36700308322906494, "learning_rate": 3.4662054208854524e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6808 }, { "completion_length": 483.25, "epoch": 1.8871951219512195, "grad_norm": 0.0, "kl": 0.45562130212783813, "learning_rate": 3.4658016322293426e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6809 }, { "completion_length": 550.5, "epoch": 1.8874722838137472, "grad_norm": 0.46904852986335754, "kl": 2284.2392578125, "learning_rate": 3.465397813956115e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6810 }, { "completion_length": 502.0, "epoch": 1.887749445676275, "grad_norm": 0.0, "kl": 0.21858853101730347, "learning_rate": 3.4649939660781517e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6811 }, { "completion_length": 484.5, "epoch": 1.8880266075388028, "grad_norm": 0.0, "kl": 0.19604206085205078, "learning_rate": 3.4645900886078388e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6812 }, { "completion_length": 545.75, "epoch": 1.8883037694013303, "grad_norm": 0.0, "kl": 0.2820665240287781, "learning_rate": 3.46418618155756e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6813 }, { "completion_length": 532.25, "epoch": 1.8885809312638582, "grad_norm": 0.0, "kl": 0.18185706436634064, "learning_rate": 3.463782244939702e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6814 }, { "completion_length": 482.0, "epoch": 1.8888580931263859, "grad_norm": 0.0, "kl": 0.255270391702652, "learning_rate": 3.4633782787666516e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6815 }, { "completion_length": 528.75, "epoch": 1.8891352549889135, "grad_norm": 0.49868738651275635, "kl": 1341315874816.0, "learning_rate": 3.462974283050797e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6816 }, { "completion_length": 551.75, "epoch": 1.8894124168514412, "grad_norm": 0.0, "kl": 390820134912.0, "learning_rate": 3.462570257804527e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6817 }, { "completion_length": 495.25, "epoch": 1.889689578713969, "grad_norm": 0.0, "kl": 0.22737057507038116, "learning_rate": 3.4621662030402315e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6818 }, { "completion_length": 542.75, "epoch": 1.8899667405764968, "grad_norm": 0.0, "kl": 0.2584047019481659, "learning_rate": 3.461762118770301e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6819 }, { "completion_length": 527.5, "epoch": 1.8902439024390243, "grad_norm": 0.0, "kl": 0.20918618142604828, "learning_rate": 3.461358005007128e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6820 }, { "completion_length": 580.0, "epoch": 1.8905210643015522, "grad_norm": 0.0, "kl": 0.17476022243499756, "learning_rate": 3.460953861763103e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6821 }, { "completion_length": 522.5, "epoch": 1.8907982261640797, "grad_norm": 0.0, "kl": 0.2045501172542572, "learning_rate": 3.4605496890506207e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6822 }, { "completion_length": 554.25, "epoch": 1.8910753880266076, "grad_norm": 0.0, "kl": 0.21576407551765442, "learning_rate": 3.4601454868820757e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6823 }, { "completion_length": 518.75, "epoch": 1.8913525498891353, "grad_norm": 0.0, "kl": 0.2525082528591156, "learning_rate": 3.4597412552698617e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6824 }, { "completion_length": 504.75, "epoch": 1.891629711751663, "grad_norm": 0.0, "kl": 0.2728814482688904, "learning_rate": 3.4593369942263766e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6825 }, { "completion_length": 585.0, "epoch": 1.8919068736141909, "grad_norm": 0.0, "kl": 0.26227426528930664, "learning_rate": 3.4589327037640163e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6826 }, { "completion_length": 527.0, "epoch": 1.8921840354767183, "grad_norm": 0.0, "kl": 0.17655843496322632, "learning_rate": 3.45852838389518e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6827 }, { "completion_length": 537.0, "epoch": 1.8924611973392462, "grad_norm": 0.0, "kl": 0.2107314020395279, "learning_rate": 3.458124034632264e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6828 }, { "completion_length": 467.25, "epoch": 1.8927383592017737, "grad_norm": 0.0, "kl": 0.2527773082256317, "learning_rate": 3.4577196559876715e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6829 }, { "completion_length": 510.75, "epoch": 1.8930155210643016, "grad_norm": 0.0, "kl": 0.20582248270511627, "learning_rate": 3.4573152479737993e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6830 }, { "completion_length": 596.0, "epoch": 1.8932926829268293, "grad_norm": 0.0, "kl": 0.19692543148994446, "learning_rate": 3.456910810603052e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6831 }, { "completion_length": 501.25, "epoch": 1.893569844789357, "grad_norm": 0.0, "kl": 1156239982592.0, "learning_rate": 3.45650634388783e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6832 }, { "completion_length": 538.75, "epoch": 1.8938470066518847, "grad_norm": 0.0, "kl": 0.21731553971767426, "learning_rate": 3.4561018478405378e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6833 }, { "completion_length": 635.25, "epoch": 1.8941241685144123, "grad_norm": 0.0, "kl": 0.15270011126995087, "learning_rate": 3.455697322473579e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6834 }, { "completion_length": 453.0, "epoch": 1.8944013303769403, "grad_norm": 0.0, "kl": 0.21015623211860657, "learning_rate": 3.4552927677993593e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6835 }, { "completion_length": 483.75, "epoch": 1.8946784922394677, "grad_norm": 0.0, "kl": 0.17267532646656036, "learning_rate": 3.454888183830284e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6836 }, { "completion_length": 508.25, "epoch": 1.8949556541019956, "grad_norm": 0.0, "kl": 0.20172041654586792, "learning_rate": 3.4544835705787603e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6837 }, { "completion_length": 571.0, "epoch": 1.8952328159645233, "grad_norm": 0.0, "kl": 0.19281341135501862, "learning_rate": 3.4540789280571962e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6838 }, { "completion_length": 543.75, "epoch": 1.895509977827051, "grad_norm": 0.4303217828273773, "kl": 575766462464.0, "learning_rate": 3.4536742562780003e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6839 }, { "completion_length": 494.75, "epoch": 1.8957871396895787, "grad_norm": 0.0, "kl": 0.19728386402130127, "learning_rate": 3.4532695552535816e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6840 }, { "completion_length": 528.0, "epoch": 1.8960643015521064, "grad_norm": 0.0, "kl": 3353903890432.0, "learning_rate": 3.452864824996352e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6841 }, { "completion_length": 511.75, "epoch": 1.8963414634146343, "grad_norm": 0.0, "kl": 0.19502168893814087, "learning_rate": 3.452460065518721e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6842 }, { "completion_length": 545.25, "epoch": 1.8966186252771617, "grad_norm": 0.0, "kl": 0.22144855558872223, "learning_rate": 3.452055276833103e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6843 }, { "completion_length": 453.5, "epoch": 1.8968957871396896, "grad_norm": 0.0, "kl": 0.24653714895248413, "learning_rate": 3.4516504589519096e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6844 }, { "completion_length": 441.75, "epoch": 1.8971729490022173, "grad_norm": 0.0, "kl": 12.243403434753418, "learning_rate": 3.4512456118875555e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6845 }, { "completion_length": 460.5, "epoch": 1.897450110864745, "grad_norm": 0.0, "kl": 0.18014591932296753, "learning_rate": 3.450840735652456e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6846 }, { "completion_length": 509.75, "epoch": 1.8977272727272727, "grad_norm": 0.0, "kl": 0.3019845187664032, "learning_rate": 3.450435830259025e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6847 }, { "completion_length": 586.0, "epoch": 1.8980044345898004, "grad_norm": 0.0, "kl": 0.17585790157318115, "learning_rate": 3.4500308957196828e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6848 }, { "completion_length": 540.25, "epoch": 1.8982815964523283, "grad_norm": 0.0, "kl": 13.54255199432373, "learning_rate": 3.449625932046844e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6849 }, { "completion_length": 420.0, "epoch": 1.8985587583148558, "grad_norm": 0.0, "kl": 0.2888088524341583, "learning_rate": 3.4492209392529285e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6850 }, { "completion_length": 578.75, "epoch": 1.8988359201773837, "grad_norm": 0.0, "kl": 0.28345125913619995, "learning_rate": 3.448815917350355e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6851 }, { "completion_length": 505.75, "epoch": 1.8991130820399114, "grad_norm": 0.0, "kl": 0.23413150012493134, "learning_rate": 3.4484108663515446e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6852 }, { "completion_length": 490.25, "epoch": 1.899390243902439, "grad_norm": 0.0, "kl": 0.5259062647819519, "learning_rate": 3.448005786268918e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6853 }, { "completion_length": 498.5, "epoch": 1.8996674057649667, "grad_norm": 0.0, "kl": 0.27920717000961304, "learning_rate": 3.447600677114898e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6854 }, { "completion_length": 527.5, "epoch": 1.8999445676274944, "grad_norm": 0.0, "kl": 0.18256862461566925, "learning_rate": 3.447195538901906e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6855 }, { "completion_length": 418.0, "epoch": 1.9002217294900223, "grad_norm": 0.0, "kl": 0.2436179369688034, "learning_rate": 3.446790371642369e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6856 }, { "completion_length": 549.75, "epoch": 1.9004988913525498, "grad_norm": 0.0, "kl": 0.24145427346229553, "learning_rate": 3.446385175348709e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6857 }, { "completion_length": 562.0, "epoch": 1.9007760532150777, "grad_norm": 0.0, "kl": 0.17197558283805847, "learning_rate": 3.445979950033351e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6858 }, { "completion_length": 512.25, "epoch": 1.9010532150776052, "grad_norm": 0.0, "kl": 0.23865191638469696, "learning_rate": 3.4455746957087255e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6859 }, { "completion_length": 514.25, "epoch": 1.901330376940133, "grad_norm": 0.0, "kl": 0.1826101541519165, "learning_rate": 3.4451694123872563e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6860 }, { "completion_length": 540.5, "epoch": 1.9016075388026608, "grad_norm": 0.0, "kl": 0.2186620980501175, "learning_rate": 3.4447641000813734e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6861 }, { "completion_length": 547.0, "epoch": 1.9018847006651884, "grad_norm": 0.0, "kl": 0.2452690601348877, "learning_rate": 3.444358758803505e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6862 }, { "completion_length": 504.25, "epoch": 1.9021618625277164, "grad_norm": 0.0, "kl": 0.2063581645488739, "learning_rate": 3.4439533885660826e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6863 }, { "completion_length": 501.0, "epoch": 1.9024390243902438, "grad_norm": 0.0, "kl": 0.19415533542633057, "learning_rate": 3.443547989381536e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6864 }, { "completion_length": 412.0, "epoch": 1.9027161862527717, "grad_norm": 0.0, "kl": 0.20095176994800568, "learning_rate": 3.443142561262299e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6865 }, { "completion_length": 427.75, "epoch": 1.9029933481152992, "grad_norm": 0.0, "kl": 0.21761839091777802, "learning_rate": 3.442737104220801e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6866 }, { "completion_length": 454.0, "epoch": 1.903270509977827, "grad_norm": 0.0, "kl": 0.3716268241405487, "learning_rate": 3.4423316182694795e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6867 }, { "completion_length": 502.5, "epoch": 1.9035476718403548, "grad_norm": 0.0, "kl": 21747.87890625, "learning_rate": 3.441926103420766e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6868 }, { "completion_length": 566.5, "epoch": 1.9038248337028825, "grad_norm": 0.0, "kl": 915696320512.0, "learning_rate": 3.4415205596870976e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6869 }, { "completion_length": 526.5, "epoch": 1.9041019955654102, "grad_norm": 0.46806761622428894, "kl": 0.21813875436782837, "learning_rate": 3.44111498708091e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6870 }, { "completion_length": 540.25, "epoch": 1.9043791574279378, "grad_norm": 0.0, "kl": 0.3133074641227722, "learning_rate": 3.440709385614642e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6871 }, { "completion_length": 582.75, "epoch": 1.9046563192904657, "grad_norm": 3.8214166164398193, "kl": 7166408785920.0, "learning_rate": 3.440303755300729e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6872 }, { "completion_length": 491.75, "epoch": 1.9049334811529932, "grad_norm": 0.0, "kl": 0.20029714703559875, "learning_rate": 3.4398980961516114e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6873 }, { "completion_length": 507.0, "epoch": 1.9052106430155211, "grad_norm": 0.0, "kl": 0.1688329577445984, "learning_rate": 3.4394924081797296e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6874 }, { "completion_length": 580.75, "epoch": 1.9054878048780488, "grad_norm": 0.0, "kl": 0.15168744325637817, "learning_rate": 3.439086691397524e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6875 }, { "completion_length": 555.0, "epoch": 1.9057649667405765, "grad_norm": 0.0, "kl": 0.1955231875181198, "learning_rate": 3.4386809458174364e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6876 }, { "completion_length": 490.25, "epoch": 1.9060421286031042, "grad_norm": 0.0, "kl": 0.19434085488319397, "learning_rate": 3.4382751714519073e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6877 }, { "completion_length": 496.5, "epoch": 1.9063192904656319, "grad_norm": 0.0, "kl": 0.3148242235183716, "learning_rate": 3.4378693683133836e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6878 }, { "completion_length": 574.0, "epoch": 1.9065964523281598, "grad_norm": 0.0, "kl": 0.23817917704582214, "learning_rate": 3.437463536414307e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6879 }, { "completion_length": 541.0, "epoch": 1.9068736141906872, "grad_norm": 0.0, "kl": 0.1899229735136032, "learning_rate": 3.437057675767124e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6880 }, { "completion_length": 516.5, "epoch": 1.9071507760532151, "grad_norm": 0.0, "kl": 0.17092934250831604, "learning_rate": 3.4366517863842797e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6881 }, { "completion_length": 569.5, "epoch": 1.9074279379157428, "grad_norm": 0.0, "kl": 0.18635053932666779, "learning_rate": 3.4362458682782217e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6882 }, { "completion_length": 606.75, "epoch": 1.9077050997782705, "grad_norm": 0.0, "kl": 0.23142001032829285, "learning_rate": 3.4358399214613976e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6883 }, { "completion_length": 557.0, "epoch": 1.9079822616407982, "grad_norm": 0.0, "kl": 0.22193200886249542, "learning_rate": 3.4354339459462556e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6884 }, { "completion_length": 495.75, "epoch": 1.908259423503326, "grad_norm": 0.0, "kl": 9830576160768.0, "learning_rate": 3.435027941745246e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6885 }, { "completion_length": 507.75, "epoch": 1.9085365853658538, "grad_norm": 0.60076504945755, "kl": 414852743168.0, "learning_rate": 3.4346219088708197e-06, "loss": 0.0, "reward": 1.625, "reward_std": 0.25, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6886 }, { "completion_length": 467.0, "epoch": 1.9088137472283813, "grad_norm": 0.5244084596633911, "kl": 27389430071296.0, "learning_rate": 3.4342158473354264e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6887 }, { "completion_length": 564.25, "epoch": 1.9090909090909092, "grad_norm": 0.0, "kl": 0.19712212681770325, "learning_rate": 3.4338097571515204e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6888 }, { "completion_length": 502.5, "epoch": 1.9093680709534369, "grad_norm": 0.0, "kl": 0.251451700925827, "learning_rate": 3.4334036383315527e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6889 }, { "completion_length": 572.25, "epoch": 1.9096452328159645, "grad_norm": 0.0, "kl": 0.2057259976863861, "learning_rate": 3.432997490887979e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6890 }, { "completion_length": 552.75, "epoch": 1.9099223946784922, "grad_norm": 0.0, "kl": 0.18333058059215546, "learning_rate": 3.4325913148332534e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6891 }, { "completion_length": 514.5, "epoch": 1.91019955654102, "grad_norm": 0.0, "kl": 0.24728509783744812, "learning_rate": 3.4321851101798305e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6892 }, { "completion_length": 487.25, "epoch": 1.9104767184035478, "grad_norm": 0.0, "kl": 0.21439646184444427, "learning_rate": 3.4317788769401692e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6893 }, { "completion_length": 511.0, "epoch": 1.9107538802660753, "grad_norm": 0.0, "kl": 0.23890411853790283, "learning_rate": 3.4313726151267256e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6894 }, { "completion_length": 421.0, "epoch": 1.9110310421286032, "grad_norm": 0.0, "kl": 0.24533814191818237, "learning_rate": 3.4309663247519587e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6895 }, { "completion_length": 463.0, "epoch": 1.9113082039911307, "grad_norm": 0.0, "kl": 0.25250595808029175, "learning_rate": 3.4305600058283265e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6896 }, { "completion_length": 577.0, "epoch": 1.9115853658536586, "grad_norm": 0.0, "kl": 1890867216384.0, "learning_rate": 3.430153658368291e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6897 }, { "completion_length": 590.25, "epoch": 1.9118625277161863, "grad_norm": 0.0, "kl": 0.21529518067836761, "learning_rate": 3.4297472823843113e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6898 }, { "completion_length": 592.5, "epoch": 1.912139689578714, "grad_norm": 0.0, "kl": 0.2769835889339447, "learning_rate": 3.4293408778888504e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6899 }, { "completion_length": 528.5, "epoch": 1.9124168514412418, "grad_norm": 0.0, "kl": 0.1951468586921692, "learning_rate": 3.42893444489437e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6900 }, { "completion_length": 614.0, "epoch": 1.9126940133037693, "grad_norm": 0.0, "kl": 0.191062331199646, "learning_rate": 3.4285279834133354e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6901 }, { "completion_length": 500.5, "epoch": 1.9129711751662972, "grad_norm": 0.0, "kl": 0.22225232422351837, "learning_rate": 3.42812149345821e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6902 }, { "completion_length": 532.0, "epoch": 1.9132483370288247, "grad_norm": 0.0, "kl": 0.2415698617696762, "learning_rate": 3.4277149750414597e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6903 }, { "completion_length": 526.5, "epoch": 1.9135254988913526, "grad_norm": 0.0, "kl": 0.19902975857257843, "learning_rate": 3.427308428175549e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6904 }, { "completion_length": 544.5, "epoch": 1.9138026607538803, "grad_norm": 0.0, "kl": 0.1937248557806015, "learning_rate": 3.4269018528729468e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6905 }, { "completion_length": 534.25, "epoch": 1.914079822616408, "grad_norm": 0.0, "kl": 2.194115400314331, "learning_rate": 3.4264952491461214e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6906 }, { "completion_length": 584.0, "epoch": 1.9143569844789357, "grad_norm": 0.0, "kl": 0.19894260168075562, "learning_rate": 3.42608861700754e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6907 }, { "completion_length": 503.75, "epoch": 1.9146341463414633, "grad_norm": 0.0, "kl": 0.20717072486877441, "learning_rate": 3.4256819564696737e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6908 }, { "completion_length": 547.0, "epoch": 1.9149113082039912, "grad_norm": 1.4650018215179443, "kl": 8896527360.0, "learning_rate": 3.425275267544992e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6909 }, { "completion_length": 581.5, "epoch": 1.9151884700665187, "grad_norm": 0.0, "kl": 0.18364499509334564, "learning_rate": 3.424868550245967e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6910 }, { "completion_length": 584.25, "epoch": 1.9154656319290466, "grad_norm": 0.655525267124176, "kl": 1382021726208.0, "learning_rate": 3.4244618045850714e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6911 }, { "completion_length": 427.0, "epoch": 1.9157427937915743, "grad_norm": 0.0, "kl": 0.2524467706680298, "learning_rate": 3.4240550305747776e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6912 }, { "completion_length": 542.5, "epoch": 1.916019955654102, "grad_norm": 0.0, "kl": 511728.375, "learning_rate": 3.4236482282275602e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6913 }, { "completion_length": 523.0, "epoch": 1.9162971175166297, "grad_norm": 0.0, "kl": 200537.703125, "learning_rate": 3.423241397555893e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6914 }, { "completion_length": 576.75, "epoch": 1.9165742793791574, "grad_norm": 0.0, "kl": 0.18956471979618073, "learning_rate": 3.422834538572254e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6915 }, { "completion_length": 582.5, "epoch": 1.9168514412416853, "grad_norm": 0.0, "kl": 753.919677734375, "learning_rate": 3.422427651289118e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6916 }, { "completion_length": 640.5, "epoch": 1.9171286031042127, "grad_norm": 0.0, "kl": 243432.328125, "learning_rate": 3.422020735718963e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6917 }, { "completion_length": 609.5, "epoch": 1.9174057649667406, "grad_norm": 0.0, "kl": 0.19614854454994202, "learning_rate": 3.421613791874268e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6918 }, { "completion_length": 460.5, "epoch": 1.9176829268292683, "grad_norm": 0.0, "kl": 0.19740809500217438, "learning_rate": 3.421206819767511e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6919 }, { "completion_length": 594.0, "epoch": 1.917960088691796, "grad_norm": 0.0, "kl": 0.17348317801952362, "learning_rate": 3.4207998194111737e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6920 }, { "completion_length": 544.0, "epoch": 1.9182372505543237, "grad_norm": 0.0, "kl": 0.3183216452598572, "learning_rate": 3.4203927908177364e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6921 }, { "completion_length": 574.25, "epoch": 1.9185144124168514, "grad_norm": 0.0, "kl": 1391924215808.0, "learning_rate": 3.4199857339996807e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6922 }, { "completion_length": 541.5, "epoch": 1.9187915742793793, "grad_norm": 0.0, "kl": 0.2483709305524826, "learning_rate": 3.4195786489694892e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6923 }, { "completion_length": 626.25, "epoch": 1.9190687361419068, "grad_norm": 0.0, "kl": 0.1749916523694992, "learning_rate": 3.4191715357396465e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6924 }, { "completion_length": 541.75, "epoch": 1.9193458980044347, "grad_norm": 0.0, "kl": 0.23703980445861816, "learning_rate": 3.4187643943226365e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6925 }, { "completion_length": 582.25, "epoch": 1.9196230598669624, "grad_norm": 0.0, "kl": 0.16554521024227142, "learning_rate": 3.4183572247309437e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6926 }, { "completion_length": 636.0, "epoch": 1.91990022172949, "grad_norm": 0.0, "kl": 2767646720.0, "learning_rate": 3.4179500269770567e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6927 }, { "completion_length": 541.0, "epoch": 1.9201773835920177, "grad_norm": 0.0, "kl": 0.19182860851287842, "learning_rate": 3.417542801073459e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6928 }, { "completion_length": 484.5, "epoch": 1.9204545454545454, "grad_norm": 0.0, "kl": 1310937055232.0, "learning_rate": 3.417135547032642e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6929 }, { "completion_length": 605.25, "epoch": 1.9207317073170733, "grad_norm": 0.5056121349334717, "kl": 294709231616.0, "learning_rate": 3.416728264867092e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6930 }, { "completion_length": 508.0, "epoch": 1.9210088691796008, "grad_norm": 0.0, "kl": 0.287910133600235, "learning_rate": 3.4163209545893e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6931 }, { "completion_length": 514.0, "epoch": 1.9212860310421287, "grad_norm": 0.0, "kl": 0.28424981236457825, "learning_rate": 3.4159136162117557e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6932 }, { "completion_length": 534.0, "epoch": 1.9215631929046562, "grad_norm": 9.003520011901855, "kl": 0.34614598751068115, "learning_rate": 3.4155062497469516e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6933 }, { "completion_length": 544.75, "epoch": 1.921840354767184, "grad_norm": 0.0, "kl": 0.19967903196811676, "learning_rate": 3.415098855207379e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6934 }, { "completion_length": 562.75, "epoch": 1.9221175166297118, "grad_norm": 3.5492491722106934, "kl": 19862380.0, "learning_rate": 3.4146914326055315e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6935 }, { "completion_length": 586.75, "epoch": 1.9223946784922394, "grad_norm": 0.0, "kl": 0.18258777260780334, "learning_rate": 3.4142839819539013e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6936 }, { "completion_length": 572.75, "epoch": 1.9226718403547673, "grad_norm": 0.3953985869884491, "kl": 732895051776.0, "learning_rate": 3.4138765032649863e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6937 }, { "completion_length": 663.5, "epoch": 1.9229490022172948, "grad_norm": 0.0, "kl": 0.17704139649868011, "learning_rate": 3.41346899655128e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6938 }, { "completion_length": 452.5, "epoch": 1.9232261640798227, "grad_norm": 0.0, "kl": 0.30197709798812866, "learning_rate": 3.4130614618252793e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6939 }, { "completion_length": 516.5, "epoch": 1.9235033259423502, "grad_norm": 0.0, "kl": 0.28382033109664917, "learning_rate": 3.4126538990994824e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6940 }, { "completion_length": 526.5, "epoch": 1.923780487804878, "grad_norm": 0.0, "kl": 10086046.0, "learning_rate": 3.4122463083863868e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6941 }, { "completion_length": 494.75, "epoch": 1.9240576496674058, "grad_norm": 0.0, "kl": 0.2246282547712326, "learning_rate": 3.411838689698492e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6942 }, { "completion_length": 552.75, "epoch": 1.9243348115299335, "grad_norm": 0.0, "kl": 0.19343096017837524, "learning_rate": 3.4114310430482967e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6943 }, { "completion_length": 600.5, "epoch": 1.9246119733924612, "grad_norm": 0.0, "kl": 0.17063191533088684, "learning_rate": 3.4110233684483033e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6944 }, { "completion_length": 633.5, "epoch": 1.9248891352549888, "grad_norm": 0.0, "kl": 0.19149528443813324, "learning_rate": 3.4106156659110127e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6945 }, { "completion_length": 493.0, "epoch": 1.9251662971175167, "grad_norm": 0.0, "kl": 0.23320499062538147, "learning_rate": 3.410207935448929e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6946 }, { "completion_length": 491.0, "epoch": 1.9254434589800442, "grad_norm": 0.0, "kl": 0.22618749737739563, "learning_rate": 3.409800177074553e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6947 }, { "completion_length": 630.25, "epoch": 1.9257206208425721, "grad_norm": 0.0, "kl": 0.15154026448726654, "learning_rate": 3.409392390800391e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6948 }, { "completion_length": 625.0, "epoch": 1.9259977827050998, "grad_norm": 0.0, "kl": 0.21049819886684418, "learning_rate": 3.408984576638947e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6949 }, { "completion_length": 570.0, "epoch": 1.9262749445676275, "grad_norm": 0.0, "kl": 0.22393137216567993, "learning_rate": 3.408576734602728e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6950 }, { "completion_length": 603.5, "epoch": 1.9265521064301552, "grad_norm": 0.0, "kl": 0.20287400484085083, "learning_rate": 3.4081688647042384e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6951 }, { "completion_length": 513.75, "epoch": 1.9268292682926829, "grad_norm": 0.0, "kl": 0.18764813244342804, "learning_rate": 3.4077609669559888e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6952 }, { "completion_length": 483.75, "epoch": 1.9271064301552108, "grad_norm": 0.0, "kl": 0.2091650664806366, "learning_rate": 3.407353041370486e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6953 }, { "completion_length": 591.0, "epoch": 1.9273835920177382, "grad_norm": 0.0, "kl": 0.1602870672941208, "learning_rate": 3.4069450879602407e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6954 }, { "completion_length": 506.25, "epoch": 1.9276607538802661, "grad_norm": 0.0, "kl": 0.23144467175006866, "learning_rate": 3.406537106737762e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6955 }, { "completion_length": 513.0, "epoch": 1.9279379157427938, "grad_norm": 0.0, "kl": 0.1910376250743866, "learning_rate": 3.406129097715561e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6956 }, { "completion_length": 552.25, "epoch": 1.9282150776053215, "grad_norm": 0.0, "kl": 0.20724037289619446, "learning_rate": 3.4057210609061504e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6957 }, { "completion_length": 471.0, "epoch": 1.9284922394678492, "grad_norm": 0.0, "kl": 0.2121119201183319, "learning_rate": 3.4053129963220423e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6958 }, { "completion_length": 561.5, "epoch": 1.9287694013303769, "grad_norm": 0.0, "kl": 0.18637453019618988, "learning_rate": 3.4049049039757505e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6959 }, { "completion_length": 646.25, "epoch": 1.9290465631929048, "grad_norm": 0.0, "kl": 0.2093927562236786, "learning_rate": 3.4044967838797895e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6960 }, { "completion_length": 572.0, "epoch": 1.9293237250554323, "grad_norm": 0.5016438961029053, "kl": 2282547052544.0, "learning_rate": 3.4040886360466753e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6961 }, { "completion_length": 665.25, "epoch": 1.9296008869179602, "grad_norm": 0.0, "kl": 0.17139926552772522, "learning_rate": 3.4036804604889235e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6962 }, { "completion_length": 578.25, "epoch": 1.9298780487804879, "grad_norm": 0.0, "kl": 0.1644650250673294, "learning_rate": 3.4032722572190513e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6963 }, { "completion_length": 534.75, "epoch": 1.9301552106430155, "grad_norm": 0.0, "kl": 0.233192577958107, "learning_rate": 3.402864026249576e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6964 }, { "completion_length": 481.25, "epoch": 1.9304323725055432, "grad_norm": 0.0, "kl": 0.17830730974674225, "learning_rate": 3.4024557675930177e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6965 }, { "completion_length": 743.5, "epoch": 1.930709534368071, "grad_norm": 0.0, "kl": 2243190.0, "learning_rate": 3.4020474812618932e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6966 }, { "completion_length": 544.0, "epoch": 1.9309866962305988, "grad_norm": 0.0, "kl": 0.19637291133403778, "learning_rate": 3.4016391672687268e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6967 }, { "completion_length": 541.25, "epoch": 1.9312638580931263, "grad_norm": 0.0, "kl": 0.23556841909885406, "learning_rate": 3.4012308256260366e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6968 }, { "completion_length": 587.0, "epoch": 1.9315410199556542, "grad_norm": 0.0, "kl": 0.21305571496486664, "learning_rate": 3.400822456346348e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6969 }, { "completion_length": 516.75, "epoch": 1.9318181818181817, "grad_norm": 0.0, "kl": 3883646464.0, "learning_rate": 3.4004140594421795e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6970 }, { "completion_length": 602.25, "epoch": 1.9320953436807096, "grad_norm": 0.0, "kl": 0.32934412360191345, "learning_rate": 3.4000056349260585e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6971 }, { "completion_length": 643.5, "epoch": 1.9323725055432373, "grad_norm": 1.5909274816513062, "kl": 647729.6875, "learning_rate": 3.399597182810509e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6972 }, { "completion_length": 496.0, "epoch": 1.932649667405765, "grad_norm": 0.0, "kl": 0.1852095127105713, "learning_rate": 3.3991887031080558e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6973 }, { "completion_length": 679.0, "epoch": 1.9329268292682928, "grad_norm": 0.0, "kl": 204435767296.0, "learning_rate": 3.3987801958312254e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6974 }, { "completion_length": 571.75, "epoch": 1.9332039911308203, "grad_norm": 0.0, "kl": 0.16828478872776031, "learning_rate": 3.3983716609925456e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6975 }, { "completion_length": 536.5, "epoch": 1.9334811529933482, "grad_norm": 0.0, "kl": 290398142464.0, "learning_rate": 3.3979630986045436e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6976 }, { "completion_length": 531.0, "epoch": 1.9337583148558757, "grad_norm": 0.7589097619056702, "kl": 13497.6103515625, "learning_rate": 3.397554508679749e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6977 }, { "completion_length": 551.5, "epoch": 1.9340354767184036, "grad_norm": 0.0, "kl": 0.21377255022525787, "learning_rate": 3.397145891230692e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6978 }, { "completion_length": 451.25, "epoch": 1.9343126385809313, "grad_norm": 0.0, "kl": 0.1885647475719452, "learning_rate": 3.3967372462699018e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6979 }, { "completion_length": 482.75, "epoch": 1.934589800443459, "grad_norm": 0.0, "kl": 0.20055317878723145, "learning_rate": 3.396328573809911e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6980 }, { "completion_length": 661.5, "epoch": 1.9348669623059866, "grad_norm": 0.0, "kl": 0.1720636636018753, "learning_rate": 3.39591987386325e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6981 }, { "completion_length": 582.0, "epoch": 1.9351441241685143, "grad_norm": 0.0, "kl": 0.21372832357883453, "learning_rate": 3.3955111464424556e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6982 }, { "completion_length": 518.25, "epoch": 1.9354212860310422, "grad_norm": 0.0, "kl": 0.17981769144535065, "learning_rate": 3.395102391560059e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6983 }, { "completion_length": 463.25, "epoch": 1.9356984478935697, "grad_norm": 0.0, "kl": 8297286656.0, "learning_rate": 3.3946936092285954e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6984 }, { "completion_length": 591.25, "epoch": 1.9359756097560976, "grad_norm": 0.0, "kl": 0.17620322108268738, "learning_rate": 3.3942847994606e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6985 }, { "completion_length": 461.25, "epoch": 1.9362527716186253, "grad_norm": 0.0, "kl": 0.25778961181640625, "learning_rate": 3.3938759622686107e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6986 }, { "completion_length": 615.25, "epoch": 1.936529933481153, "grad_norm": 0.0, "kl": 0.16485293209552765, "learning_rate": 3.393467097665164e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6987 }, { "completion_length": 486.25, "epoch": 1.9368070953436807, "grad_norm": 0.0, "kl": 0.2833409607410431, "learning_rate": 3.3930582056627974e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6988 }, { "completion_length": 507.0, "epoch": 1.9370842572062084, "grad_norm": 0.0, "kl": 0.3719347417354584, "learning_rate": 3.3926492862740514e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6989 }, { "completion_length": 551.5, "epoch": 1.9373614190687363, "grad_norm": 0.0, "kl": 0.16074761748313904, "learning_rate": 3.3922403395114645e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6990 }, { "completion_length": 577.0, "epoch": 1.9376385809312637, "grad_norm": 1.7880363464355469, "kl": 48348656.0, "learning_rate": 3.391831365387579e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6991 }, { "completion_length": 526.5, "epoch": 1.9379157427937916, "grad_norm": 0.0, "kl": 0.2241332083940506, "learning_rate": 3.3914223639149347e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6992 }, { "completion_length": 681.0, "epoch": 1.9381929046563193, "grad_norm": 0.9928951859474182, "kl": 380283486208.0, "learning_rate": 3.3910133351060748e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6993 }, { "completion_length": 477.75, "epoch": 1.938470066518847, "grad_norm": 0.0, "kl": 0.18516592681407928, "learning_rate": 3.3906042789735427e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6994 }, { "completion_length": 527.75, "epoch": 1.9387472283813747, "grad_norm": 0.0, "kl": 0.2314903289079666, "learning_rate": 3.3901951955298824e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6995 }, { "completion_length": 517.25, "epoch": 1.9390243902439024, "grad_norm": 0.0, "kl": 0.21582819521427155, "learning_rate": 3.3897860847876374e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6996 }, { "completion_length": 656.0, "epoch": 1.9393015521064303, "grad_norm": 0.0, "kl": 0.1735968440771103, "learning_rate": 3.389376946759356e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6997 }, { "completion_length": 644.0, "epoch": 1.9395787139689578, "grad_norm": 0.0, "kl": 0.16014209389686584, "learning_rate": 3.388967781457582e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6998 }, { "completion_length": 541.25, "epoch": 1.9398558758314857, "grad_norm": 0.4379822015762329, "kl": 0.18979866802692413, "learning_rate": 3.388558588894865e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 6999 }, { "completion_length": 530.0, "epoch": 1.9401330376940134, "grad_norm": 0.0, "kl": 0.21931400895118713, "learning_rate": 3.3881493690837526e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7000 }, { "completion_length": 544.0, "epoch": 1.940410199556541, "grad_norm": 0.0, "kl": 0.16136068105697632, "learning_rate": 3.3877401220367933e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7001 }, { "completion_length": 490.0, "epoch": 1.9406873614190687, "grad_norm": 0.0, "kl": 0.29238584637641907, "learning_rate": 3.3873308477665372e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7002 }, { "completion_length": 500.25, "epoch": 1.9409645232815964, "grad_norm": 0.6995112895965576, "kl": 718460354560.0, "learning_rate": 3.3869215462855354e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7003 }, { "completion_length": 620.0, "epoch": 1.9412416851441243, "grad_norm": 0.0, "kl": 0.17728202044963837, "learning_rate": 3.386512217606339e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7004 }, { "completion_length": 572.75, "epoch": 1.9415188470066518, "grad_norm": 0.0, "kl": 0.19179394841194153, "learning_rate": 3.3861028617415013e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7005 }, { "completion_length": 529.25, "epoch": 1.9417960088691797, "grad_norm": 0.0, "kl": 0.20988832414150238, "learning_rate": 3.3856934787035743e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7006 }, { "completion_length": 539.75, "epoch": 1.9420731707317072, "grad_norm": 0.0, "kl": 0.18753886222839355, "learning_rate": 3.385284068505113e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7007 }, { "completion_length": 521.25, "epoch": 1.942350332594235, "grad_norm": 0.0, "kl": 0.17412765324115753, "learning_rate": 3.384874631158673e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7008 }, { "completion_length": 543.75, "epoch": 1.9426274944567627, "grad_norm": 0.0, "kl": 2682420.25, "learning_rate": 3.3844651666768073e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7009 }, { "completion_length": 520.75, "epoch": 1.9429046563192904, "grad_norm": 0.0, "kl": 0.21970297396183014, "learning_rate": 3.3840556750720755e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7010 }, { "completion_length": 524.75, "epoch": 1.9431818181818183, "grad_norm": 0.0, "kl": 0.2429463118314743, "learning_rate": 3.3836461563570322e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7011 }, { "completion_length": 498.75, "epoch": 1.9434589800443458, "grad_norm": 0.0, "kl": 0.17682349681854248, "learning_rate": 3.3832366105442387e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7012 }, { "completion_length": 497.0, "epoch": 1.9437361419068737, "grad_norm": 0.0, "kl": 0.2364681214094162, "learning_rate": 3.382827037646252e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7013 }, { "completion_length": 472.25, "epoch": 1.9440133037694012, "grad_norm": 0.0, "kl": 0.21650074422359467, "learning_rate": 3.382417437675633e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7014 }, { "completion_length": 544.0, "epoch": 1.944290465631929, "grad_norm": 0.0, "kl": 0.3922140896320343, "learning_rate": 3.382007810644941e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7015 }, { "completion_length": 480.25, "epoch": 1.9445676274944568, "grad_norm": 0.0, "kl": 0.2459414154291153, "learning_rate": 3.381598156566739e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7016 }, { "completion_length": 450.25, "epoch": 1.9448447893569845, "grad_norm": 0.0, "kl": 0.18191467225551605, "learning_rate": 3.3811884754535897e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7017 }, { "completion_length": 543.25, "epoch": 1.9451219512195121, "grad_norm": 0.0, "kl": 0.26697254180908203, "learning_rate": 3.380778767318054e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7018 }, { "completion_length": 568.5, "epoch": 1.9453991130820398, "grad_norm": 0.0, "kl": 0.1893502026796341, "learning_rate": 3.380369032172698e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7019 }, { "completion_length": 485.75, "epoch": 1.9456762749445677, "grad_norm": 0.0, "kl": 0.1709754467010498, "learning_rate": 3.3799592700300867e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7020 }, { "completion_length": 497.0, "epoch": 1.9459534368070952, "grad_norm": 0.0, "kl": 0.23899470269680023, "learning_rate": 3.379549480902784e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7021 }, { "completion_length": 611.75, "epoch": 1.9462305986696231, "grad_norm": 0.0, "kl": 0.21086326241493225, "learning_rate": 3.379139664803358e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7022 }, { "completion_length": 545.0, "epoch": 1.9465077605321508, "grad_norm": 0.0, "kl": 0.18862202763557434, "learning_rate": 3.3787298217443764e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7023 }, { "completion_length": 673.75, "epoch": 1.9467849223946785, "grad_norm": 1.032253623008728, "kl": 11946.2333984375, "learning_rate": 3.3783199517384056e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7024 }, { "completion_length": 529.5, "epoch": 1.9470620842572062, "grad_norm": 0.0, "kl": 0.23638023436069489, "learning_rate": 3.3779100547980166e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7025 }, { "completion_length": 507.5, "epoch": 1.9473392461197339, "grad_norm": 0.0, "kl": 0.2943798303604126, "learning_rate": 3.3775001309357767e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7026 }, { "completion_length": 564.0, "epoch": 1.9476164079822618, "grad_norm": 0.0, "kl": 0.26059049367904663, "learning_rate": 3.37709018016426e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7027 }, { "completion_length": 588.75, "epoch": 1.9478935698447892, "grad_norm": 0.0, "kl": 0.19727735221385956, "learning_rate": 3.3766802024960345e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7028 }, { "completion_length": 489.0, "epoch": 1.9481707317073171, "grad_norm": 0.0, "kl": 0.22597555816173553, "learning_rate": 3.3762701979436747e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7029 }, { "completion_length": 574.75, "epoch": 1.9484478935698448, "grad_norm": 0.0, "kl": 59967.875, "learning_rate": 3.3758601665197533e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7030 }, { "completion_length": 448.25, "epoch": 1.9487250554323725, "grad_norm": 0.0, "kl": 0.8488590717315674, "learning_rate": 3.375450108236844e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7031 }, { "completion_length": 436.75, "epoch": 1.9490022172949002, "grad_norm": 0.0, "kl": 0.3814024329185486, "learning_rate": 3.3750400231075214e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7032 }, { "completion_length": 514.0, "epoch": 1.9492793791574279, "grad_norm": 0.0, "kl": 0.18735437095165253, "learning_rate": 3.3746299111443616e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7033 }, { "completion_length": 549.5, "epoch": 1.9495565410199558, "grad_norm": 0.0, "kl": 0.15910179913043976, "learning_rate": 3.3742197723599403e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7034 }, { "completion_length": 514.75, "epoch": 1.9498337028824833, "grad_norm": 0.0, "kl": 0.20908844470977783, "learning_rate": 3.373809606766836e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7035 }, { "completion_length": 513.5, "epoch": 1.9501108647450112, "grad_norm": 0.0, "kl": 639710656.0, "learning_rate": 3.373399414377625e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7036 }, { "completion_length": 476.25, "epoch": 1.9503880266075388, "grad_norm": 0.0, "kl": 0.23185977339744568, "learning_rate": 3.3729891952048876e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7037 }, { "completion_length": 593.25, "epoch": 1.9506651884700665, "grad_norm": 0.0, "kl": 0.20054113864898682, "learning_rate": 3.3725789492612037e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7038 }, { "completion_length": 501.75, "epoch": 1.9509423503325942, "grad_norm": 0.0, "kl": 0.2378295511007309, "learning_rate": 3.372168676559153e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7039 }, { "completion_length": 492.25, "epoch": 1.951219512195122, "grad_norm": 0.0, "kl": 5.22808313369751, "learning_rate": 3.3717583771113177e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7040 }, { "completion_length": 545.75, "epoch": 1.9514966740576498, "grad_norm": 0.0, "kl": 0.18869177997112274, "learning_rate": 3.3713480509302775e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7041 }, { "completion_length": 542.5, "epoch": 1.9517738359201773, "grad_norm": 0.0, "kl": 0.2165433019399643, "learning_rate": 3.3709376980286195e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7042 }, { "completion_length": 440.75, "epoch": 1.9520509977827052, "grad_norm": 0.0, "kl": 0.25930383801460266, "learning_rate": 3.3705273184189242e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7043 }, { "completion_length": 483.5, "epoch": 1.9523281596452327, "grad_norm": 0.0, "kl": 0.22723214328289032, "learning_rate": 3.3701169121137778e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7044 }, { "completion_length": 497.5, "epoch": 1.9526053215077606, "grad_norm": 0.0, "kl": 207329.671875, "learning_rate": 3.369706479125765e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7045 }, { "completion_length": 507.25, "epoch": 1.9528824833702882, "grad_norm": 0.0, "kl": 1.2645208835601807, "learning_rate": 3.369296019467473e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7046 }, { "completion_length": 588.0, "epoch": 1.953159645232816, "grad_norm": 0.0, "kl": 0.183479443192482, "learning_rate": 3.3688855331514867e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7047 }, { "completion_length": 534.25, "epoch": 1.9534368070953438, "grad_norm": 0.0, "kl": 0.19791731238365173, "learning_rate": 3.368475020190397e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7048 }, { "completion_length": 513.25, "epoch": 1.9537139689578713, "grad_norm": 0.0, "kl": 0.1999455839395523, "learning_rate": 3.3680644805967905e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7049 }, { "completion_length": 434.25, "epoch": 1.9539911308203992, "grad_norm": 0.0, "kl": 0.17352400720119476, "learning_rate": 3.3676539143832577e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7050 }, { "completion_length": 683.75, "epoch": 1.9542682926829267, "grad_norm": 0.0, "kl": 0.1493086963891983, "learning_rate": 3.367243321562389e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7051 }, { "completion_length": 550.0, "epoch": 1.9545454545454546, "grad_norm": 0.0, "kl": 0.354470819234848, "learning_rate": 3.366832702146775e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7052 }, { "completion_length": 557.0, "epoch": 1.9548226164079823, "grad_norm": 0.0, "kl": 0.20564943552017212, "learning_rate": 3.366422056149008e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7053 }, { "completion_length": 510.0, "epoch": 1.95509977827051, "grad_norm": 0.0, "kl": 0.3529791831970215, "learning_rate": 3.36601138358168e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7054 }, { "completion_length": 680.75, "epoch": 1.9553769401330376, "grad_norm": 0.0, "kl": 0.16504931449890137, "learning_rate": 3.3656006844573863e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7055 }, { "completion_length": 559.25, "epoch": 1.9556541019955653, "grad_norm": 0.0, "kl": 0.1906619668006897, "learning_rate": 3.3651899587887204e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7056 }, { "completion_length": 483.5, "epoch": 1.9559312638580932, "grad_norm": 0.0, "kl": 0.19744502007961273, "learning_rate": 3.3647792065882768e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7057 }, { "completion_length": 440.75, "epoch": 1.9562084257206207, "grad_norm": 0.0, "kl": 0.2992638051509857, "learning_rate": 3.364368427868653e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7058 }, { "completion_length": 781.75, "epoch": 1.9564855875831486, "grad_norm": 0.0, "kl": 0.14164476096630096, "learning_rate": 3.3639576226424454e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7059 }, { "completion_length": 516.75, "epoch": 1.9567627494456763, "grad_norm": 0.0, "kl": 0.20877115428447723, "learning_rate": 3.363546790922251e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7060 }, { "completion_length": 515.75, "epoch": 1.957039911308204, "grad_norm": 0.0, "kl": 0.32920369505882263, "learning_rate": 3.363135932720669e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7061 }, { "completion_length": 535.25, "epoch": 1.9573170731707317, "grad_norm": 0.494495689868927, "kl": 0.24637244641780853, "learning_rate": 3.3627250480502983e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7062 }, { "completion_length": 460.25, "epoch": 1.9575942350332594, "grad_norm": 0.0, "kl": 0.29718640446662903, "learning_rate": 3.3623141369237394e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7063 }, { "completion_length": 545.0, "epoch": 1.9578713968957873, "grad_norm": 0.0, "kl": 0.45796382427215576, "learning_rate": 3.361903199353593e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7064 }, { "completion_length": 548.75, "epoch": 1.9581485587583147, "grad_norm": 0.0, "kl": 0.19427843391895294, "learning_rate": 3.361492235352461e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7065 }, { "completion_length": 622.5, "epoch": 1.9584257206208426, "grad_norm": 0.0, "kl": 0.16736602783203125, "learning_rate": 3.3610812449329454e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7066 }, { "completion_length": 464.25, "epoch": 1.9587028824833703, "grad_norm": 0.0, "kl": 0.2511107921600342, "learning_rate": 3.360670228107651e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7067 }, { "completion_length": 499.0, "epoch": 1.958980044345898, "grad_norm": 0.0, "kl": 0.22632774710655212, "learning_rate": 3.36025918488918e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7068 }, { "completion_length": 603.25, "epoch": 1.9592572062084257, "grad_norm": 0.0, "kl": 14947307225088.0, "learning_rate": 3.359848115290139e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7069 }, { "completion_length": 557.75, "epoch": 1.9595343680709534, "grad_norm": 0.0, "kl": 0.1638771891593933, "learning_rate": 3.359437019323133e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7070 }, { "completion_length": 456.5, "epoch": 1.9598115299334813, "grad_norm": 0.0, "kl": 0.2198805958032608, "learning_rate": 3.359025897000769e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7071 }, { "completion_length": 625.25, "epoch": 1.9600886917960088, "grad_norm": 0.0, "kl": 0.1867382824420929, "learning_rate": 3.3586147483356534e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7072 }, { "completion_length": 512.75, "epoch": 1.9603658536585367, "grad_norm": 0.0, "kl": 0.2895068824291229, "learning_rate": 3.3582035733403963e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7073 }, { "completion_length": 474.25, "epoch": 1.9606430155210643, "grad_norm": 0.0, "kl": 0.2602494955062866, "learning_rate": 3.357792372027605e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7074 }, { "completion_length": 534.25, "epoch": 1.960920177383592, "grad_norm": 0.0, "kl": 0.18525715172290802, "learning_rate": 3.3573811444098903e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7075 }, { "completion_length": 501.5, "epoch": 1.9611973392461197, "grad_norm": 0.0, "kl": 9.294727325439453, "learning_rate": 3.3569698904998626e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7076 }, { "completion_length": 573.75, "epoch": 1.9614745011086474, "grad_norm": 0.0, "kl": 0.2768203616142273, "learning_rate": 3.356558610310133e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7077 }, { "completion_length": 548.0, "epoch": 1.9617516629711753, "grad_norm": 0.0, "kl": 0.20174184441566467, "learning_rate": 3.3561473038533134e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7078 }, { "completion_length": 622.25, "epoch": 1.9620288248337028, "grad_norm": 0.0, "kl": 0.2163151651620865, "learning_rate": 3.355735971142018e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7079 }, { "completion_length": 464.25, "epoch": 1.9623059866962307, "grad_norm": 0.0, "kl": 0.2372342348098755, "learning_rate": 3.3553246121888606e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7080 }, { "completion_length": 538.25, "epoch": 1.9625831485587582, "grad_norm": 0.0, "kl": 0.198662668466568, "learning_rate": 3.354913227006455e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7081 }, { "completion_length": 538.25, "epoch": 1.962860310421286, "grad_norm": 0.0, "kl": 0.1992826759815216, "learning_rate": 3.3545018156074172e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7082 }, { "completion_length": 544.25, "epoch": 1.9631374722838137, "grad_norm": 0.0, "kl": 0.3269796371459961, "learning_rate": 3.3540903780043633e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7083 }, { "completion_length": 543.75, "epoch": 1.9634146341463414, "grad_norm": 0.0, "kl": 0.17882078886032104, "learning_rate": 3.35367891420991e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7084 }, { "completion_length": 521.5, "epoch": 1.9636917960088693, "grad_norm": 0.0, "kl": 0.19686143100261688, "learning_rate": 3.3532674242366764e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7085 }, { "completion_length": 590.0, "epoch": 1.9639689578713968, "grad_norm": 1.5734584331512451, "kl": 4818009587712.0, "learning_rate": 3.35285590809728e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7086 }, { "completion_length": 541.5, "epoch": 1.9642461197339247, "grad_norm": 0.684108316898346, "kl": 90278.375, "learning_rate": 3.3524443658043406e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7087 }, { "completion_length": 485.0, "epoch": 1.9645232815964522, "grad_norm": 0.0, "kl": 0.19492173194885254, "learning_rate": 3.352032797370478e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7088 }, { "completion_length": 528.0, "epoch": 1.96480044345898, "grad_norm": 0.0, "kl": 3249.42529296875, "learning_rate": 3.351621202808315e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7089 }, { "completion_length": 500.25, "epoch": 1.9650776053215078, "grad_norm": 0.4469601809978485, "kl": 189839.359375, "learning_rate": 3.351209582130472e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7090 }, { "completion_length": 575.5, "epoch": 1.9653547671840355, "grad_norm": 0.0, "kl": 0.19892001152038574, "learning_rate": 3.350797935349572e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7091 }, { "completion_length": 518.0, "epoch": 1.9656319290465631, "grad_norm": 0.0, "kl": 0.18805545568466187, "learning_rate": 3.3503862624782387e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7092 }, { "completion_length": 472.75, "epoch": 1.9659090909090908, "grad_norm": 0.0, "kl": 0.25036293268203735, "learning_rate": 3.349974563529096e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7093 }, { "completion_length": 556.0, "epoch": 1.9661862527716187, "grad_norm": 0.0, "kl": 0.2416379451751709, "learning_rate": 3.349562838514769e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7094 }, { "completion_length": 546.25, "epoch": 1.9664634146341462, "grad_norm": 0.0, "kl": 0.20829224586486816, "learning_rate": 3.3491510874478845e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7095 }, { "completion_length": 598.5, "epoch": 1.966740576496674, "grad_norm": 0.0, "kl": 0.23690558969974518, "learning_rate": 3.3487393103410683e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7096 }, { "completion_length": 458.25, "epoch": 1.9670177383592018, "grad_norm": 0.0, "kl": 0.23141732811927795, "learning_rate": 3.348327507206948e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7097 }, { "completion_length": 490.0, "epoch": 1.9672949002217295, "grad_norm": 0.0, "kl": 0.41572684049606323, "learning_rate": 3.3479156780581523e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7098 }, { "completion_length": 559.0, "epoch": 1.9675720620842572, "grad_norm": 0.0, "kl": 83229936.0, "learning_rate": 3.3475038229073097e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7099 }, { "completion_length": 544.25, "epoch": 1.9678492239467849, "grad_norm": 0.0, "kl": 0.22856560349464417, "learning_rate": 3.34709194176705e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7100 }, { "completion_length": 537.0, "epoch": 1.9681263858093128, "grad_norm": 0.0, "kl": 0.1752365678548813, "learning_rate": 3.346680034650005e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7101 }, { "completion_length": 554.25, "epoch": 1.9684035476718402, "grad_norm": 0.0, "kl": 0.22164826095104218, "learning_rate": 3.346268101568805e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7102 }, { "completion_length": 653.5, "epoch": 1.9686807095343681, "grad_norm": 0.0, "kl": 0.20382976531982422, "learning_rate": 3.345856142536083e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7103 }, { "completion_length": 489.0, "epoch": 1.9689578713968958, "grad_norm": 0.0, "kl": 0.2454080432653427, "learning_rate": 3.3454441575644714e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7104 }, { "completion_length": 489.5, "epoch": 1.9692350332594235, "grad_norm": 0.0, "kl": 0.29343104362487793, "learning_rate": 3.345032146666605e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7105 }, { "completion_length": 563.75, "epoch": 1.9695121951219512, "grad_norm": 0.0, "kl": 0.24086175858974457, "learning_rate": 3.344620109855117e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7106 }, { "completion_length": 544.5, "epoch": 1.9697893569844789, "grad_norm": 0.0, "kl": 0.23755310475826263, "learning_rate": 3.3442080471426443e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7107 }, { "completion_length": 563.75, "epoch": 1.9700665188470068, "grad_norm": 0.0, "kl": 0.24301178753376007, "learning_rate": 3.343795958541823e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7108 }, { "completion_length": 564.75, "epoch": 1.9703436807095343, "grad_norm": 0.0, "kl": 0.21036823093891144, "learning_rate": 3.3433838440652888e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7109 }, { "completion_length": 480.0, "epoch": 1.9706208425720622, "grad_norm": 0.0, "kl": 0.30366238951683044, "learning_rate": 3.3429717037256815e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7110 }, { "completion_length": 501.25, "epoch": 1.9708980044345898, "grad_norm": 0.0, "kl": 0.23658493161201477, "learning_rate": 3.342559537535638e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7111 }, { "completion_length": 419.75, "epoch": 1.9711751662971175, "grad_norm": 0.0, "kl": 0.2443622350692749, "learning_rate": 3.342147345507799e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7112 }, { "completion_length": 484.25, "epoch": 1.9714523281596452, "grad_norm": 0.7519316673278809, "kl": 4726.921875, "learning_rate": 3.3417351276548037e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7113 }, { "completion_length": 729.75, "epoch": 1.971729490022173, "grad_norm": 0.0, "kl": 0.18362365663051605, "learning_rate": 3.341322883989294e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7114 }, { "completion_length": 519.5, "epoch": 1.9720066518847008, "grad_norm": 0.0, "kl": 0.2017081379890442, "learning_rate": 3.3409106145239113e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7115 }, { "completion_length": 492.25, "epoch": 1.9722838137472283, "grad_norm": 0.0, "kl": 0.36555686593055725, "learning_rate": 3.3404983192712974e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7116 }, { "completion_length": 560.5, "epoch": 1.9725609756097562, "grad_norm": 0.0, "kl": 0.20199112594127655, "learning_rate": 3.340085998244097e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7117 }, { "completion_length": 507.75, "epoch": 1.9728381374722836, "grad_norm": 0.0, "kl": 0.25243067741394043, "learning_rate": 3.339673651454954e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7118 }, { "completion_length": 562.0, "epoch": 1.9731152993348116, "grad_norm": 0.0, "kl": 0.2557868957519531, "learning_rate": 3.3392612789165124e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7119 }, { "completion_length": 524.0, "epoch": 1.9733924611973392, "grad_norm": 0.0, "kl": 0.20389313995838165, "learning_rate": 3.3388488806414186e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7120 }, { "completion_length": 541.25, "epoch": 1.973669623059867, "grad_norm": 0.0, "kl": 0.2170882672071457, "learning_rate": 3.33843645664232e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7121 }, { "completion_length": 508.75, "epoch": 1.9739467849223948, "grad_norm": 0.0, "kl": 0.24742192029953003, "learning_rate": 3.3380240069318626e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7122 }, { "completion_length": 485.0, "epoch": 1.9742239467849223, "grad_norm": 0.0, "kl": 0.28529852628707886, "learning_rate": 3.3376115315226943e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7123 }, { "completion_length": 565.75, "epoch": 1.9745011086474502, "grad_norm": 0.0, "kl": 0.16616366803646088, "learning_rate": 3.3371990304274654e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7124 }, { "completion_length": 534.75, "epoch": 1.9747782705099777, "grad_norm": 0.0, "kl": 0.20717376470565796, "learning_rate": 3.3367865036588254e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7125 }, { "completion_length": 511.25, "epoch": 1.9750554323725056, "grad_norm": 0.0, "kl": 0.20633439719676971, "learning_rate": 3.3363739512294235e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7126 }, { "completion_length": 537.25, "epoch": 1.9753325942350333, "grad_norm": 0.0, "kl": 0.22604548931121826, "learning_rate": 3.3359613731519127e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7127 }, { "completion_length": 594.75, "epoch": 1.975609756097561, "grad_norm": 0.0, "kl": 0.18870031833648682, "learning_rate": 3.3355487694389425e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7128 }, { "completion_length": 568.25, "epoch": 1.9758869179600886, "grad_norm": 0.0, "kl": 0.19518974423408508, "learning_rate": 3.335136140103169e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7129 }, { "completion_length": 502.25, "epoch": 1.9761640798226163, "grad_norm": 0.0, "kl": 0.2753055691719055, "learning_rate": 3.334723485157243e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7130 }, { "completion_length": 536.75, "epoch": 1.9764412416851442, "grad_norm": 0.0, "kl": 0.1942216157913208, "learning_rate": 3.33431080461382e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7131 }, { "completion_length": 478.75, "epoch": 1.9767184035476717, "grad_norm": 0.0, "kl": 0.26838797330856323, "learning_rate": 3.3338980984855558e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7132 }, { "completion_length": 534.0, "epoch": 1.9769955654101996, "grad_norm": 0.9002754092216492, "kl": 4489871872.0, "learning_rate": 3.3334853667851064e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7133 }, { "completion_length": 637.75, "epoch": 1.9772727272727273, "grad_norm": 0.0, "kl": 54779.5546875, "learning_rate": 3.333072609525127e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7134 }, { "completion_length": 583.75, "epoch": 1.977549889135255, "grad_norm": 0.0, "kl": 0.1666506826877594, "learning_rate": 3.3326598267182764e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7135 }, { "completion_length": 456.25, "epoch": 1.9778270509977827, "grad_norm": 0.0, "kl": 0.2477436661720276, "learning_rate": 3.332247018377213e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7136 }, { "completion_length": 510.75, "epoch": 1.9781042128603104, "grad_norm": 0.0, "kl": 0.3114957809448242, "learning_rate": 3.3318341845145953e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7137 }, { "completion_length": 460.0, "epoch": 1.9783813747228383, "grad_norm": 0.0, "kl": 0.252315878868103, "learning_rate": 3.331421325143084e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7138 }, { "completion_length": 489.0, "epoch": 1.9786585365853657, "grad_norm": 0.0, "kl": 0.441959023475647, "learning_rate": 3.3310084402753383e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7139 }, { "completion_length": 543.0, "epoch": 1.9789356984478936, "grad_norm": 0.0, "kl": 0.2468504011631012, "learning_rate": 3.3305955299240216e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7140 }, { "completion_length": 608.0, "epoch": 1.9792128603104213, "grad_norm": 0.4438917338848114, "kl": 0.21434137225151062, "learning_rate": 3.3301825941017944e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7141 }, { "completion_length": 532.0, "epoch": 1.979490022172949, "grad_norm": 0.0, "kl": 4926289215488.0, "learning_rate": 3.3297696328213215e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7142 }, { "completion_length": 584.0, "epoch": 1.9797671840354767, "grad_norm": 0.0, "kl": 0.22978715598583221, "learning_rate": 3.3293566460952653e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7143 }, { "completion_length": 515.25, "epoch": 1.9800443458980044, "grad_norm": 0.0, "kl": 0.3021978735923767, "learning_rate": 3.3289436339362905e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7144 }, { "completion_length": 539.0, "epoch": 1.9803215077605323, "grad_norm": 0.0, "kl": 0.24631084501743317, "learning_rate": 3.3285305963570625e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7145 }, { "completion_length": 505.5, "epoch": 1.9805986696230597, "grad_norm": 0.0, "kl": 1.3092494010925293, "learning_rate": 3.3281175333702486e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7146 }, { "completion_length": 615.0, "epoch": 1.9808758314855877, "grad_norm": 0.0, "kl": 0.24133624136447906, "learning_rate": 3.3277044449885145e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7147 }, { "completion_length": 640.75, "epoch": 1.9811529933481153, "grad_norm": 0.0, "kl": 0.2034279853105545, "learning_rate": 3.327291331224528e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7148 }, { "completion_length": 523.5, "epoch": 1.981430155210643, "grad_norm": 0.0, "kl": 0.2445344775915146, "learning_rate": 3.326878192090958e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7149 }, { "completion_length": 506.75, "epoch": 1.9817073170731707, "grad_norm": 0.0, "kl": 0.20636583864688873, "learning_rate": 3.326465027600474e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7150 }, { "completion_length": 502.5, "epoch": 1.9819844789356984, "grad_norm": 0.0, "kl": 353335.75, "learning_rate": 3.3260518377657448e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7151 }, { "completion_length": 523.0, "epoch": 1.9822616407982263, "grad_norm": 0.0, "kl": 0.5085856318473816, "learning_rate": 3.3256386225994424e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7152 }, { "completion_length": 658.25, "epoch": 1.9825388026607538, "grad_norm": 0.0, "kl": 0.19018737971782684, "learning_rate": 3.3252253821142375e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7153 }, { "completion_length": 544.75, "epoch": 1.9828159645232817, "grad_norm": 0.0, "kl": 0.16702918708324432, "learning_rate": 3.3248121163228037e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7154 }, { "completion_length": 606.0, "epoch": 1.9830931263858091, "grad_norm": 0.0, "kl": 0.2310035079717636, "learning_rate": 3.3243988252378136e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7155 }, { "completion_length": 574.25, "epoch": 1.983370288248337, "grad_norm": 0.0, "kl": 0.177884042263031, "learning_rate": 3.32398550887194e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7156 }, { "completion_length": 578.25, "epoch": 1.9836474501108647, "grad_norm": 0.0, "kl": 0.18217067420482635, "learning_rate": 3.3235721672378597e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7157 }, { "completion_length": 533.75, "epoch": 1.9839246119733924, "grad_norm": 0.0, "kl": 0.4579375088214874, "learning_rate": 3.3231588003482464e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7158 }, { "completion_length": 545.5, "epoch": 1.9842017738359203, "grad_norm": 0.0, "kl": 0.2411251664161682, "learning_rate": 3.3227454082157773e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7159 }, { "completion_length": 535.5, "epoch": 1.9844789356984478, "grad_norm": 0.0, "kl": 0.19904348254203796, "learning_rate": 3.3223319908531283e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7160 }, { "completion_length": 509.75, "epoch": 1.9847560975609757, "grad_norm": 0.0, "kl": 0.22524119913578033, "learning_rate": 3.3219185482729782e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7161 }, { "completion_length": 563.0, "epoch": 1.9850332594235032, "grad_norm": 0.0, "kl": 1.4868909120559692, "learning_rate": 3.3215050804880054e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7162 }, { "completion_length": 510.5, "epoch": 1.985310421286031, "grad_norm": 0.0, "kl": 0.4695400297641754, "learning_rate": 3.3210915875108895e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7163 }, { "completion_length": 617.5, "epoch": 1.9855875831485588, "grad_norm": 0.431156188249588, "kl": 0.2729968726634979, "learning_rate": 3.32067806935431e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7164 }, { "completion_length": 552.0, "epoch": 1.9858647450110865, "grad_norm": 0.0, "kl": 0.29315173625946045, "learning_rate": 3.3202645260309485e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7165 }, { "completion_length": 582.75, "epoch": 1.9861419068736141, "grad_norm": 0.0, "kl": 1.1204171180725098, "learning_rate": 3.3198509575534853e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7166 }, { "completion_length": 534.75, "epoch": 1.9864190687361418, "grad_norm": 0.0, "kl": 0.23741206526756287, "learning_rate": 3.3194373639346043e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7167 }, { "completion_length": 581.25, "epoch": 1.9866962305986697, "grad_norm": 0.0, "kl": 0.25144752860069275, "learning_rate": 3.319023745186988e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7168 }, { "completion_length": 596.25, "epoch": 1.9869733924611972, "grad_norm": 0.0, "kl": 0.20253272354602814, "learning_rate": 3.318610101323321e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7169 }, { "completion_length": 555.0, "epoch": 1.987250554323725, "grad_norm": 0.0, "kl": 0.1996677815914154, "learning_rate": 3.3181964323562866e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7170 }, { "completion_length": 568.0, "epoch": 1.9875277161862528, "grad_norm": 0.0, "kl": 0.2737431228160858, "learning_rate": 3.3177827382985716e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7171 }, { "completion_length": 620.0, "epoch": 1.9878048780487805, "grad_norm": 0.0, "kl": 0.21245642006397247, "learning_rate": 3.317369019162862e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7172 }, { "completion_length": 490.0, "epoch": 1.9880820399113082, "grad_norm": 0.0, "kl": 0.19388218224048615, "learning_rate": 3.3169552749618444e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7173 }, { "completion_length": 551.25, "epoch": 1.9883592017738358, "grad_norm": 1.388253092765808, "kl": 2945311571968.0, "learning_rate": 3.3165415057082077e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7174 }, { "completion_length": 507.25, "epoch": 1.9886363636363638, "grad_norm": 0.0, "kl": 0.1975248008966446, "learning_rate": 3.316127711414639e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7175 }, { "completion_length": 572.75, "epoch": 1.9889135254988912, "grad_norm": 0.0, "kl": 0.26423752307891846, "learning_rate": 3.315713892093829e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7176 }, { "completion_length": 549.25, "epoch": 1.9891906873614191, "grad_norm": 0.0, "kl": 0.22015628218650818, "learning_rate": 3.315300047758467e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7177 }, { "completion_length": 507.0, "epoch": 1.9894678492239468, "grad_norm": 0.0, "kl": 3.4914662837982178, "learning_rate": 3.3148861784212437e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7178 }, { "completion_length": 504.25, "epoch": 1.9897450110864745, "grad_norm": 0.0, "kl": 0.19427892565727234, "learning_rate": 3.3144722840948516e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7179 }, { "completion_length": 621.5, "epoch": 1.9900221729490022, "grad_norm": 0.0, "kl": 0.18407092988491058, "learning_rate": 3.3140583647919833e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7180 }, { "completion_length": 625.0, "epoch": 1.9902993348115299, "grad_norm": 0.0, "kl": 9451737186304.0, "learning_rate": 3.3136444205253304e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7181 }, { "completion_length": 660.75, "epoch": 1.9905764966740578, "grad_norm": 0.0, "kl": 0.24026858806610107, "learning_rate": 3.3132304513075877e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7182 }, { "completion_length": 476.25, "epoch": 1.9908536585365852, "grad_norm": 0.0, "kl": 0.24381671845912933, "learning_rate": 3.3128164571514496e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7183 }, { "completion_length": 541.75, "epoch": 1.9911308203991132, "grad_norm": 0.0, "kl": 94219878400.0, "learning_rate": 3.3124024380696134e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7184 }, { "completion_length": 516.75, "epoch": 1.9914079822616408, "grad_norm": 0.0, "kl": 0.4235144257545471, "learning_rate": 3.3119883940747734e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7185 }, { "completion_length": 468.5, "epoch": 1.9916851441241685, "grad_norm": 0.0, "kl": 0.44935429096221924, "learning_rate": 3.3115743251796266e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7186 }, { "completion_length": 571.75, "epoch": 1.9919623059866962, "grad_norm": 0.0, "kl": 0.21954551339149475, "learning_rate": 3.311160231396872e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7187 }, { "completion_length": 618.5, "epoch": 1.992239467849224, "grad_norm": 0.0, "kl": 0.197567418217659, "learning_rate": 3.3107461127392072e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7188 }, { "completion_length": 551.75, "epoch": 1.9925166297117518, "grad_norm": 0.0, "kl": 0.20966370403766632, "learning_rate": 3.310331969219332e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7189 }, { "completion_length": 573.0, "epoch": 1.9927937915742793, "grad_norm": 0.0, "kl": 679540752384.0, "learning_rate": 3.309917800849945e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7190 }, { "completion_length": 574.25, "epoch": 1.9930709534368072, "grad_norm": 0.0, "kl": 0.17387302219867706, "learning_rate": 3.3095036076437496e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7191 }, { "completion_length": 527.75, "epoch": 1.9933481152993349, "grad_norm": 0.0, "kl": 0.17454689741134644, "learning_rate": 3.309089389613445e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7192 }, { "completion_length": 585.0, "epoch": 1.9936252771618626, "grad_norm": 0.45409756898880005, "kl": 0.16141264140605927, "learning_rate": 3.308675146771736e-06, "loss": 0.0, "reward": 4.625, "reward_std": 2.25, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7193 }, { "completion_length": 461.0, "epoch": 1.9939024390243902, "grad_norm": 0.0, "kl": 0.3439924716949463, "learning_rate": 3.308260879131323e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7194 }, { "completion_length": 526.75, "epoch": 1.994179600886918, "grad_norm": 0.0, "kl": 0.8774016499519348, "learning_rate": 3.3078465867049115e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7195 }, { "completion_length": 551.75, "epoch": 1.9944567627494458, "grad_norm": 0.0, "kl": 0.20093084871768951, "learning_rate": 3.307432269505206e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7196 }, { "completion_length": 549.5, "epoch": 1.9947339246119733, "grad_norm": 0.0, "kl": 0.17932432889938354, "learning_rate": 3.3070179275449117e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7197 }, { "completion_length": 576.25, "epoch": 1.9950110864745012, "grad_norm": 0.0, "kl": 0.19793596863746643, "learning_rate": 3.306603560836733e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7198 }, { "completion_length": 566.75, "epoch": 1.9952882483370287, "grad_norm": 0.0, "kl": 0.23053283989429474, "learning_rate": 3.3061891693933805e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7199 }, { "completion_length": 644.0, "epoch": 1.9955654101995566, "grad_norm": 0.0, "kl": 0.17510318756103516, "learning_rate": 3.3057747532275588e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7200 }, { "completion_length": 468.75, "epoch": 1.9958425720620843, "grad_norm": 0.0, "kl": 0.3619982898235321, "learning_rate": 3.3053603123519784e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7201 }, { "completion_length": 499.0, "epoch": 1.996119733924612, "grad_norm": 0.0, "kl": 0.31506627798080444, "learning_rate": 3.304945846779346e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7202 }, { "completion_length": 524.5, "epoch": 1.9963968957871396, "grad_norm": 0.0, "kl": 0.2690357267856598, "learning_rate": 3.3045313565223734e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7203 }, { "completion_length": 537.0, "epoch": 1.9966740576496673, "grad_norm": 0.0, "kl": 0.19389037787914276, "learning_rate": 3.304116841593772e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7204 }, { "completion_length": 541.0, "epoch": 1.9969512195121952, "grad_norm": 0.4750312864780426, "kl": 14238074535936.0, "learning_rate": 3.30370230200625e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7205 }, { "completion_length": 546.0, "epoch": 1.9972283813747227, "grad_norm": 0.0, "kl": 0.2569229006767273, "learning_rate": 3.3032877377725238e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7206 }, { "completion_length": 595.5, "epoch": 1.9975055432372506, "grad_norm": 0.0, "kl": 0.20442306995391846, "learning_rate": 3.3028731489053035e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7207 }, { "completion_length": 584.25, "epoch": 1.9977827050997783, "grad_norm": 0.0, "kl": 762792574976.0, "learning_rate": 3.3024585354173032e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7208 }, { "completion_length": 516.25, "epoch": 1.998059866962306, "grad_norm": 0.0, "kl": 0.21877454221248627, "learning_rate": 3.3020438973212383e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7209 }, { "completion_length": 530.5, "epoch": 1.9983370288248337, "grad_norm": 1.744369387626648, "kl": 762413514752.0, "learning_rate": 3.301629234629824e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7210 }, { "completion_length": 549.0, "epoch": 1.9986141906873613, "grad_norm": 0.0, "kl": 49010937167872.0, "learning_rate": 3.3012145473557744e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7211 }, { "completion_length": 482.25, "epoch": 1.9988913525498893, "grad_norm": 0.0, "kl": 0.28958672285079956, "learning_rate": 3.300799835511809e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7212 }, { "completion_length": 489.5, "epoch": 1.9991685144124167, "grad_norm": 0.0, "kl": 0.21519377827644348, "learning_rate": 3.3003850991106424e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7213 }, { "completion_length": 531.0, "epoch": 1.9994456762749446, "grad_norm": 0.0, "kl": 0.21414360404014587, "learning_rate": 3.299970338164995e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7214 }, { "completion_length": 484.25, "epoch": 1.9997228381374723, "grad_norm": 0.0, "kl": 0.17113493382930756, "learning_rate": 3.2995555526875856e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7215 }, { "completion_length": 545.5, "epoch": 2.0, "grad_norm": 0.0, "kl": 51671723933696.0, "learning_rate": 3.2991407426911332e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7216 }, { "completion_length": 531.75, "epoch": 2.000277161862528, "grad_norm": 0.0, "kl": 0.24398557841777802, "learning_rate": 3.2987259081883583e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7217 }, { "completion_length": 632.25, "epoch": 2.0005543237250554, "grad_norm": 0.0, "kl": 0.17893953621387482, "learning_rate": 3.298311049191982e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7218 }, { "completion_length": 508.0, "epoch": 2.0008314855875833, "grad_norm": 0.0, "kl": 0.2698563039302826, "learning_rate": 3.297896165714728e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7219 }, { "completion_length": 638.75, "epoch": 2.0011086474501107, "grad_norm": 0.0, "kl": 4745002483712.0, "learning_rate": 3.2974812577693167e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7220 }, { "completion_length": 584.25, "epoch": 2.0013858093126387, "grad_norm": 0.0, "kl": 0.17083197832107544, "learning_rate": 3.2970663253684728e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7221 }, { "completion_length": 458.5, "epoch": 2.001662971175166, "grad_norm": 0.0, "kl": 0.20133563876152039, "learning_rate": 3.29665136852492e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7222 }, { "completion_length": 455.25, "epoch": 2.001940133037694, "grad_norm": 0.0, "kl": 0.27080434560775757, "learning_rate": 3.296236387251385e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7223 }, { "completion_length": 475.75, "epoch": 2.002217294900222, "grad_norm": 0.0, "kl": 0.24304009974002838, "learning_rate": 3.2958213815605916e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7224 }, { "completion_length": 471.0, "epoch": 2.0024944567627494, "grad_norm": 0.0, "kl": 0.23815883696079254, "learning_rate": 3.2954063514652672e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7225 }, { "completion_length": 498.75, "epoch": 2.0027716186252773, "grad_norm": 0.0, "kl": 0.19948241114616394, "learning_rate": 3.2949912969781384e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7226 }, { "completion_length": 481.5, "epoch": 2.0030487804878048, "grad_norm": 0.0, "kl": 0.20486634969711304, "learning_rate": 3.294576218111934e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7227 }, { "completion_length": 521.75, "epoch": 2.0033259423503327, "grad_norm": 0.0, "kl": 0.21761615574359894, "learning_rate": 3.294161114879382e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7228 }, { "completion_length": 481.25, "epoch": 2.00360310421286, "grad_norm": 0.0, "kl": 0.17130957543849945, "learning_rate": 3.293745987293212e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7229 }, { "completion_length": 486.75, "epoch": 2.003880266075388, "grad_norm": 0.0, "kl": 0.6347435116767883, "learning_rate": 3.2933308353661546e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7230 }, { "completion_length": 509.0, "epoch": 2.004157427937916, "grad_norm": 0.0, "kl": 0.7059570550918579, "learning_rate": 3.2929156591109414e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7231 }, { "completion_length": 476.5, "epoch": 2.0044345898004434, "grad_norm": 0.0, "kl": 0.38684359192848206, "learning_rate": 3.2925004585403024e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7232 }, { "completion_length": 523.0, "epoch": 2.0047117516629713, "grad_norm": 0.0, "kl": 0.1920212060213089, "learning_rate": 3.292085233666972e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7233 }, { "completion_length": 588.0, "epoch": 2.004988913525499, "grad_norm": 0.0, "kl": 0.18836773931980133, "learning_rate": 3.291669984503682e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7234 }, { "completion_length": 634.5, "epoch": 2.0052660753880267, "grad_norm": 0.0, "kl": 0.14146435260772705, "learning_rate": 3.2912547110631665e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7235 }, { "completion_length": 503.5, "epoch": 2.005543237250554, "grad_norm": 0.0, "kl": 0.19812235236167908, "learning_rate": 3.2908394133581607e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7236 }, { "completion_length": 514.0, "epoch": 2.005820399113082, "grad_norm": 0.0, "kl": 0.34836772084236145, "learning_rate": 3.2904240914013997e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7237 }, { "completion_length": 534.5, "epoch": 2.0060975609756095, "grad_norm": 0.0, "kl": 0.165089949965477, "learning_rate": 3.2900087452056208e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7238 }, { "completion_length": 582.25, "epoch": 2.0063747228381374, "grad_norm": 0.0, "kl": 0.29044345021247864, "learning_rate": 3.2895933747835594e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7239 }, { "completion_length": 544.25, "epoch": 2.0066518847006654, "grad_norm": 0.0, "kl": 0.18360665440559387, "learning_rate": 3.289177980147954e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7240 }, { "completion_length": 654.75, "epoch": 2.006929046563193, "grad_norm": 0.0, "kl": 0.16136503219604492, "learning_rate": 3.2887625613115427e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7241 }, { "completion_length": 576.75, "epoch": 2.0072062084257207, "grad_norm": 0.0, "kl": 77554065080320.0, "learning_rate": 3.2883471182870645e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7242 }, { "completion_length": 436.0, "epoch": 2.007483370288248, "grad_norm": 0.0, "kl": 0.20903617143630981, "learning_rate": 3.28793165108726e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7243 }, { "completion_length": 568.75, "epoch": 2.007760532150776, "grad_norm": 0.0, "kl": 0.8012579679489136, "learning_rate": 3.28751615972487e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7244 }, { "completion_length": 532.25, "epoch": 2.0080376940133036, "grad_norm": 0.0, "kl": 0.6828905940055847, "learning_rate": 3.2871006442126345e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7245 }, { "completion_length": 713.5, "epoch": 2.0083148558758315, "grad_norm": 0.0, "kl": 1013320253440.0, "learning_rate": 3.2866851045632976e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7246 }, { "completion_length": 561.5, "epoch": 2.0085920177383594, "grad_norm": 0.0, "kl": 0.18262358009815216, "learning_rate": 3.2862695407896e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7247 }, { "completion_length": 570.75, "epoch": 2.008869179600887, "grad_norm": 0.0, "kl": 0.2661609649658203, "learning_rate": 3.2858539529042873e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7248 }, { "completion_length": 482.5, "epoch": 2.0091463414634148, "grad_norm": 0.0, "kl": 0.18760767579078674, "learning_rate": 3.285438340920102e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7249 }, { "completion_length": 495.75, "epoch": 2.009423503325942, "grad_norm": 0.0, "kl": 0.22082363069057465, "learning_rate": 3.28502270484979e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7250 }, { "completion_length": 485.0, "epoch": 2.00970066518847, "grad_norm": 0.0, "kl": 0.2159932404756546, "learning_rate": 3.284607044706098e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7251 }, { "completion_length": 611.0, "epoch": 2.0099778270509976, "grad_norm": 0.0, "kl": 0.2064027339220047, "learning_rate": 3.284191360501771e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7252 }, { "completion_length": 527.5, "epoch": 2.0102549889135255, "grad_norm": 0.0, "kl": 0.22757728397846222, "learning_rate": 3.283775652249558e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7253 }, { "completion_length": 460.25, "epoch": 2.0105321507760534, "grad_norm": 0.0, "kl": 0.19826321303844452, "learning_rate": 3.283359919962206e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7254 }, { "completion_length": 470.25, "epoch": 2.010809312638581, "grad_norm": 0.0, "kl": 0.19292183220386505, "learning_rate": 3.282944163652464e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7255 }, { "completion_length": 530.25, "epoch": 2.011086474501109, "grad_norm": 0.0, "kl": 0.2548483610153198, "learning_rate": 3.2825283833330814e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7256 }, { "completion_length": 528.5, "epoch": 2.0113636363636362, "grad_norm": 0.0, "kl": 1.8123424053192139, "learning_rate": 3.282112579016808e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7257 }, { "completion_length": 522.25, "epoch": 2.011640798226164, "grad_norm": 0.0, "kl": 0.24267376959323883, "learning_rate": 3.2816967507163956e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7258 }, { "completion_length": 557.5, "epoch": 2.0119179600886916, "grad_norm": 0.0, "kl": 0.23885062336921692, "learning_rate": 3.281280898444596e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7259 }, { "completion_length": 691.25, "epoch": 2.0121951219512195, "grad_norm": 0.0, "kl": 0.1626874804496765, "learning_rate": 3.280865022214161e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7260 }, { "completion_length": 430.0, "epoch": 2.0124722838137474, "grad_norm": 0.0, "kl": 0.1905716508626938, "learning_rate": 3.2804491220378442e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7261 }, { "completion_length": 554.75, "epoch": 2.012749445676275, "grad_norm": 0.0, "kl": 0.395070880651474, "learning_rate": 3.2800331979283993e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7262 }, { "completion_length": 459.25, "epoch": 2.013026607538803, "grad_norm": 0.0, "kl": 0.387269526720047, "learning_rate": 3.2796172498985814e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7263 }, { "completion_length": 568.25, "epoch": 2.0133037694013303, "grad_norm": 0.0, "kl": 0.23120565712451935, "learning_rate": 3.279201277961145e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7264 }, { "completion_length": 621.5, "epoch": 2.013580931263858, "grad_norm": 0.646463930606842, "kl": 5649874812928.0, "learning_rate": 3.278785282128847e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7265 }, { "completion_length": 552.0, "epoch": 2.0138580931263856, "grad_norm": 0.0, "kl": 0.1464119255542755, "learning_rate": 3.278369262414444e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7266 }, { "completion_length": 540.25, "epoch": 2.0141352549889135, "grad_norm": 0.0, "kl": 0.24897807836532593, "learning_rate": 3.277953218830694e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7267 }, { "completion_length": 563.0, "epoch": 2.0144124168514415, "grad_norm": 0.0, "kl": 0.23546026647090912, "learning_rate": 3.277537151390355e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7268 }, { "completion_length": 504.25, "epoch": 2.014689578713969, "grad_norm": 0.0, "kl": 0.18537332117557526, "learning_rate": 3.2771210601061854e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7269 }, { "completion_length": 504.25, "epoch": 2.014966740576497, "grad_norm": 0.0, "kl": 0.22745323181152344, "learning_rate": 3.276704944990946e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7270 }, { "completion_length": 614.5, "epoch": 2.0152439024390243, "grad_norm": 0.0, "kl": 0.24344609677791595, "learning_rate": 3.276288806057397e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7271 }, { "completion_length": 449.75, "epoch": 2.015521064301552, "grad_norm": 0.0, "kl": 0.2798785865306854, "learning_rate": 3.2758726433182995e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7272 }, { "completion_length": 516.75, "epoch": 2.0157982261640797, "grad_norm": 0.0, "kl": 0.20575881004333496, "learning_rate": 3.2754564567864157e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7273 }, { "completion_length": 483.0, "epoch": 2.0160753880266076, "grad_norm": 0.0, "kl": 0.21659162640571594, "learning_rate": 3.2750402464745084e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7274 }, { "completion_length": 507.5, "epoch": 2.016352549889135, "grad_norm": 0.0, "kl": 0.2162400186061859, "learning_rate": 3.2746240123953406e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7275 }, { "completion_length": 528.0, "epoch": 2.016629711751663, "grad_norm": 0.0, "kl": 0.18717734515666962, "learning_rate": 3.2742077545616768e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7276 }, { "completion_length": 510.0, "epoch": 2.016906873614191, "grad_norm": 0.0, "kl": 0.20315125584602356, "learning_rate": 3.2737914729862817e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7277 }, { "completion_length": 592.5, "epoch": 2.0171840354767183, "grad_norm": 0.0, "kl": 0.20825247466564178, "learning_rate": 3.2733751676819213e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7278 }, { "completion_length": 511.75, "epoch": 2.0174611973392462, "grad_norm": 0.0, "kl": 0.2831152677536011, "learning_rate": 3.272958838661361e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7279 }, { "completion_length": 546.25, "epoch": 2.0177383592017737, "grad_norm": 0.0, "kl": 203865440256.0, "learning_rate": 3.272542485937369e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7280 }, { "completion_length": 514.75, "epoch": 2.0180155210643016, "grad_norm": 0.0, "kl": 0.19722425937652588, "learning_rate": 3.2721261095227126e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7281 }, { "completion_length": 502.5, "epoch": 2.018292682926829, "grad_norm": 0.0, "kl": 0.20675145089626312, "learning_rate": 3.2717097094301605e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7282 }, { "completion_length": 504.5, "epoch": 2.018569844789357, "grad_norm": 0.0, "kl": 0.2176118642091751, "learning_rate": 3.2712932856724815e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7283 }, { "completion_length": 474.75, "epoch": 2.018847006651885, "grad_norm": 0.0, "kl": 0.22008392214775085, "learning_rate": 3.2708768382624455e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7284 }, { "completion_length": 630.75, "epoch": 2.0191241685144123, "grad_norm": 0.0, "kl": 314.141357421875, "learning_rate": 3.270460367212825e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7285 }, { "completion_length": 477.25, "epoch": 2.0194013303769403, "grad_norm": 0.0, "kl": 3888.998046875, "learning_rate": 3.270043872536389e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7286 }, { "completion_length": 440.0, "epoch": 2.0196784922394677, "grad_norm": 0.0, "kl": 0.26458144187927246, "learning_rate": 3.2696273542459114e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7287 }, { "completion_length": 490.25, "epoch": 2.0199556541019956, "grad_norm": 0.0, "kl": 0.613836407661438, "learning_rate": 3.2692108123541638e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7288 }, { "completion_length": 521.25, "epoch": 2.020232815964523, "grad_norm": 0.0, "kl": 0.2227359414100647, "learning_rate": 3.2687942468739208e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7289 }, { "completion_length": 624.25, "epoch": 2.020509977827051, "grad_norm": 0.0, "kl": 0.34453919529914856, "learning_rate": 3.268377657817956e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7290 }, { "completion_length": 476.75, "epoch": 2.020787139689579, "grad_norm": 0.0, "kl": 0.2646888494491577, "learning_rate": 3.2679610451990455e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7291 }, { "completion_length": 515.25, "epoch": 2.0210643015521064, "grad_norm": 0.0, "kl": 0.3025100827217102, "learning_rate": 3.267544409029964e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7292 }, { "completion_length": 484.0, "epoch": 2.0213414634146343, "grad_norm": 0.0, "kl": 0.18869878351688385, "learning_rate": 3.267127749323489e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7293 }, { "completion_length": 543.75, "epoch": 2.0216186252771617, "grad_norm": 0.0, "kl": 0.20172329246997833, "learning_rate": 3.266711066092396e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7294 }, { "completion_length": 706.25, "epoch": 2.0218957871396896, "grad_norm": 0.0, "kl": 0.3494556248188019, "learning_rate": 3.2662943593494646e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7295 }, { "completion_length": 572.75, "epoch": 2.022172949002217, "grad_norm": 0.0, "kl": 0.16009099781513214, "learning_rate": 3.2658776291074724e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7296 }, { "completion_length": 564.0, "epoch": 2.022450110864745, "grad_norm": 0.0, "kl": 0.22426766157150269, "learning_rate": 3.2654608753792e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7297 }, { "completion_length": 526.25, "epoch": 2.022727272727273, "grad_norm": 0.0, "kl": 0.19180460274219513, "learning_rate": 3.265044098177426e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7298 }, { "completion_length": 585.75, "epoch": 2.0230044345898004, "grad_norm": 0.0, "kl": 0.34847742319107056, "learning_rate": 3.2646272975149324e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7299 }, { "completion_length": 502.25, "epoch": 2.0232815964523283, "grad_norm": 0.0, "kl": 0.3302238881587982, "learning_rate": 3.2642104734045012e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7300 }, { "completion_length": 598.75, "epoch": 2.0235587583148558, "grad_norm": 0.0, "kl": 0.2244248241186142, "learning_rate": 3.2637936258589126e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7301 }, { "completion_length": 578.0, "epoch": 2.0238359201773837, "grad_norm": 0.0, "kl": 0.1802038997411728, "learning_rate": 3.2633767548909513e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7302 }, { "completion_length": 543.0, "epoch": 2.024113082039911, "grad_norm": 0.0, "kl": 0.20430216193199158, "learning_rate": 3.2629598605134e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7303 }, { "completion_length": 552.5, "epoch": 2.024390243902439, "grad_norm": 0.9794589281082153, "kl": 4082421727232.0, "learning_rate": 3.262542942739044e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7304 }, { "completion_length": 617.25, "epoch": 2.024667405764967, "grad_norm": 0.0, "kl": 0.17447011172771454, "learning_rate": 3.2621260015806685e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7305 }, { "completion_length": 551.5, "epoch": 2.0249445676274944, "grad_norm": 0.39349931478500366, "kl": 0.27354854345321655, "learning_rate": 3.2617090370510584e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7306 }, { "completion_length": 504.0, "epoch": 2.0252217294900223, "grad_norm": 0.0, "kl": 0.20331691205501556, "learning_rate": 3.2612920491630008e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7307 }, { "completion_length": 546.25, "epoch": 2.02549889135255, "grad_norm": 0.0, "kl": 0.2015232890844345, "learning_rate": 3.2608750379292836e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7308 }, { "completion_length": 519.25, "epoch": 2.0257760532150777, "grad_norm": 0.0, "kl": 0.19975177943706512, "learning_rate": 3.2604580033626933e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7309 }, { "completion_length": 562.0, "epoch": 2.026053215077605, "grad_norm": 0.0, "kl": 0.18293112516403198, "learning_rate": 3.2600409454760197e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7310 }, { "completion_length": 517.5, "epoch": 2.026330376940133, "grad_norm": 0.0, "kl": 0.40718960762023926, "learning_rate": 3.2596238642820517e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7311 }, { "completion_length": 553.25, "epoch": 2.0266075388026605, "grad_norm": 0.0, "kl": 0.20396196842193604, "learning_rate": 3.2592067597935806e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7312 }, { "completion_length": 600.5, "epoch": 2.0268847006651884, "grad_norm": 0.0, "kl": 0.20722375810146332, "learning_rate": 3.258789632023395e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7313 }, { "completion_length": 532.0, "epoch": 2.0271618625277164, "grad_norm": 0.0, "kl": 0.17830128967761993, "learning_rate": 3.2583724809842893e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7314 }, { "completion_length": 610.25, "epoch": 2.027439024390244, "grad_norm": 0.0, "kl": 0.187505841255188, "learning_rate": 3.2579553066890535e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7315 }, { "completion_length": 601.75, "epoch": 2.0277161862527717, "grad_norm": 0.0, "kl": 1.6065679788589478, "learning_rate": 3.257538109150482e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7316 }, { "completion_length": 573.75, "epoch": 2.027993348115299, "grad_norm": 0.0, "kl": 0.24351879954338074, "learning_rate": 3.2571208883813678e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7317 }, { "completion_length": 656.25, "epoch": 2.028270509977827, "grad_norm": 0.0, "kl": 0.1665741354227066, "learning_rate": 3.2567036443945054e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7318 }, { "completion_length": 513.75, "epoch": 2.0285476718403546, "grad_norm": 0.0, "kl": 0.21354061365127563, "learning_rate": 3.25628637720269e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7319 }, { "completion_length": 571.25, "epoch": 2.0288248337028825, "grad_norm": 0.0, "kl": 0.17185643315315247, "learning_rate": 3.255869086818717e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7320 }, { "completion_length": 651.25, "epoch": 2.0291019955654104, "grad_norm": 0.0, "kl": 0.16225099563598633, "learning_rate": 3.255451773255385e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7321 }, { "completion_length": 536.0, "epoch": 2.029379157427938, "grad_norm": 0.0, "kl": 0.19093121588230133, "learning_rate": 3.2550344365254884e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7322 }, { "completion_length": 475.25, "epoch": 2.0296563192904657, "grad_norm": 0.0, "kl": 0.20527248084545135, "learning_rate": 3.2546170766418268e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7323 }, { "completion_length": 551.0, "epoch": 2.029933481152993, "grad_norm": 0.0, "kl": 0.18034937977790833, "learning_rate": 3.2541996936171983e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7324 }, { "completion_length": 557.75, "epoch": 2.030210643015521, "grad_norm": 0.0, "kl": 0.16238506138324738, "learning_rate": 3.2537822874644026e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7325 }, { "completion_length": 712.75, "epoch": 2.0304878048780486, "grad_norm": 0.0, "kl": 0.1540173441171646, "learning_rate": 3.25336485819624e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7326 }, { "completion_length": 642.0, "epoch": 2.0307649667405765, "grad_norm": 1.3431822061538696, "kl": 80.8061752319336, "learning_rate": 3.2529474058255117e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7327 }, { "completion_length": 448.25, "epoch": 2.0310421286031044, "grad_norm": 0.0, "kl": 0.2510741353034973, "learning_rate": 3.252529930365018e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7328 }, { "completion_length": 488.25, "epoch": 2.031319290465632, "grad_norm": 0.0, "kl": 0.28278660774230957, "learning_rate": 3.2521124318275623e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7329 }, { "completion_length": 522.25, "epoch": 2.0315964523281598, "grad_norm": 0.0, "kl": 0.1912241280078888, "learning_rate": 3.2516949102259466e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7330 }, { "completion_length": 515.25, "epoch": 2.0318736141906872, "grad_norm": 0.0, "kl": 0.23790821433067322, "learning_rate": 3.2512773655729746e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7331 }, { "completion_length": 505.5, "epoch": 2.032150776053215, "grad_norm": 0.0, "kl": 0.22674676775932312, "learning_rate": 3.2508597978814515e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7332 }, { "completion_length": 487.75, "epoch": 2.0324279379157426, "grad_norm": 0.0, "kl": 0.2005789875984192, "learning_rate": 3.250442207164182e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7333 }, { "completion_length": 489.5, "epoch": 2.0327050997782705, "grad_norm": 0.0, "kl": 1482436509696.0, "learning_rate": 3.2500245934339714e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7334 }, { "completion_length": 561.0, "epoch": 2.0329822616407984, "grad_norm": 0.0, "kl": 107866415104.0, "learning_rate": 3.2496069567036265e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7335 }, { "completion_length": 700.25, "epoch": 2.033259423503326, "grad_norm": 0.0, "kl": 0.1792103350162506, "learning_rate": 3.249189296985955e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7336 }, { "completion_length": 561.0, "epoch": 2.033536585365854, "grad_norm": 0.0, "kl": 0.18351300060749054, "learning_rate": 3.248771614293764e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7337 }, { "completion_length": 485.0, "epoch": 2.0338137472283813, "grad_norm": 0.5878324508666992, "kl": 4535.5283203125, "learning_rate": 3.248353908639863e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7338 }, { "completion_length": 589.75, "epoch": 2.034090909090909, "grad_norm": 0.0, "kl": 26.51211166381836, "learning_rate": 3.2479361800370596e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7339 }, { "completion_length": 573.5, "epoch": 2.0343680709534366, "grad_norm": 0.0, "kl": 0.18841685354709625, "learning_rate": 3.247518428498166e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7340 }, { "completion_length": 569.25, "epoch": 2.0346452328159645, "grad_norm": 0.0, "kl": 0.19517368078231812, "learning_rate": 3.2471006540359907e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7341 }, { "completion_length": 515.75, "epoch": 2.0349223946784925, "grad_norm": 0.48450008034706116, "kl": 0.20760579407215118, "learning_rate": 3.2466828566633473e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7342 }, { "completion_length": 643.5, "epoch": 2.03519955654102, "grad_norm": 0.0, "kl": 0.18638864159584045, "learning_rate": 3.246265036393046e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7343 }, { "completion_length": 499.5, "epoch": 2.035476718403548, "grad_norm": 0.0, "kl": 0.27326953411102295, "learning_rate": 3.245847193237901e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7344 }, { "completion_length": 528.75, "epoch": 2.0357538802660753, "grad_norm": 0.4110112488269806, "kl": 16554345365504.0, "learning_rate": 3.245429327210725e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7345 }, { "completion_length": 534.0, "epoch": 2.036031042128603, "grad_norm": 0.0, "kl": 0.2533685863018036, "learning_rate": 3.2450114383243323e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7346 }, { "completion_length": 597.25, "epoch": 2.0363082039911307, "grad_norm": 0.0, "kl": 0.22612151503562927, "learning_rate": 3.244593526591538e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7347 }, { "completion_length": 559.75, "epoch": 2.0365853658536586, "grad_norm": 0.0, "kl": 0.6196643114089966, "learning_rate": 3.244175592025158e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7348 }, { "completion_length": 525.0, "epoch": 2.0368625277161865, "grad_norm": 0.0, "kl": 0.31540077924728394, "learning_rate": 3.2437576346380077e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7349 }, { "completion_length": 548.0, "epoch": 2.037139689578714, "grad_norm": 0.0, "kl": 0.30397501587867737, "learning_rate": 3.243339654442905e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7350 }, { "completion_length": 501.75, "epoch": 2.037416851441242, "grad_norm": 0.0, "kl": 0.22079233825206757, "learning_rate": 3.2429216514526673e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7351 }, { "completion_length": 497.5, "epoch": 2.0376940133037693, "grad_norm": 0.0, "kl": 0.220035120844841, "learning_rate": 3.2425036256801135e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7352 }, { "completion_length": 597.75, "epoch": 2.037971175166297, "grad_norm": 0.0, "kl": 14.388387680053711, "learning_rate": 3.2420855771380616e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7353 }, { "completion_length": 594.25, "epoch": 2.0382483370288247, "grad_norm": 0.0, "kl": 0.299100786447525, "learning_rate": 3.2416675058393317e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7354 }, { "completion_length": 484.5, "epoch": 2.0385254988913526, "grad_norm": 0.0, "kl": 0.21556507050991058, "learning_rate": 3.2412494117967456e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7355 }, { "completion_length": 588.25, "epoch": 2.03880266075388, "grad_norm": 1.2216613292694092, "kl": 711492763648.0, "learning_rate": 3.240831295023123e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7356 }, { "completion_length": 510.25, "epoch": 2.039079822616408, "grad_norm": 0.4409240186214447, "kl": 89552609869824.0, "learning_rate": 3.240413155531287e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7357 }, { "completion_length": 611.0, "epoch": 2.039356984478936, "grad_norm": 0.0, "kl": 0.1934439241886139, "learning_rate": 3.239994993334059e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7358 }, { "completion_length": 577.25, "epoch": 2.0396341463414633, "grad_norm": 0.0, "kl": 0.37658244371414185, "learning_rate": 3.239576808444263e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7359 }, { "completion_length": 558.5, "epoch": 2.0399113082039912, "grad_norm": 0.0, "kl": 0.9297658801078796, "learning_rate": 3.239158600874723e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7360 }, { "completion_length": 565.5, "epoch": 2.0401884700665187, "grad_norm": 0.0, "kl": 0.4197952449321747, "learning_rate": 3.2387403706382636e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7361 }, { "completion_length": 599.75, "epoch": 2.0404656319290466, "grad_norm": 0.0, "kl": 0.21762534976005554, "learning_rate": 3.2383221177477087e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7362 }, { "completion_length": 588.0, "epoch": 2.040742793791574, "grad_norm": 0.0, "kl": 0.5104728937149048, "learning_rate": 3.2379038422158876e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7363 }, { "completion_length": 446.5, "epoch": 2.041019955654102, "grad_norm": 0.0, "kl": 0.280844509601593, "learning_rate": 3.2374855440556242e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7364 }, { "completion_length": 605.75, "epoch": 2.04129711751663, "grad_norm": 0.0, "kl": 0.22619926929473877, "learning_rate": 3.237067223279748e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7365 }, { "completion_length": 542.5, "epoch": 2.0415742793791574, "grad_norm": 0.0, "kl": 0.19861672818660736, "learning_rate": 3.2366488799010855e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7366 }, { "completion_length": 553.5, "epoch": 2.0418514412416853, "grad_norm": 0.0, "kl": 7249939398656.0, "learning_rate": 3.2362305139324656e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7367 }, { "completion_length": 665.25, "epoch": 2.0421286031042127, "grad_norm": 0.0, "kl": 0.21033617854118347, "learning_rate": 3.2358121253867204e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7368 }, { "completion_length": 461.25, "epoch": 2.0424057649667406, "grad_norm": 0.0, "kl": 0.26184895634651184, "learning_rate": 3.2353937142766755e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7369 }, { "completion_length": 515.5, "epoch": 2.042682926829268, "grad_norm": 0.0, "kl": 0.5076331496238708, "learning_rate": 3.234975280615167e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7370 }, { "completion_length": 490.5, "epoch": 2.042960088691796, "grad_norm": 0.0, "kl": 0.2038716822862625, "learning_rate": 3.234556824415023e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7371 }, { "completion_length": 485.25, "epoch": 2.043237250554324, "grad_norm": 0.0, "kl": 0.5223156213760376, "learning_rate": 3.2341383456890776e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7372 }, { "completion_length": 649.5, "epoch": 2.0435144124168514, "grad_norm": 0.0, "kl": 0.16663320362567902, "learning_rate": 3.233719844450162e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7373 }, { "completion_length": 576.0, "epoch": 2.0437915742793793, "grad_norm": 0.0, "kl": 0.1914564073085785, "learning_rate": 3.233301320711112e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7374 }, { "completion_length": 715.75, "epoch": 2.0440687361419068, "grad_norm": 0.0, "kl": 0.1516938954591751, "learning_rate": 3.2328827744847603e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7375 }, { "completion_length": 595.0, "epoch": 2.0443458980044347, "grad_norm": 0.0, "kl": 0.24960248172283173, "learning_rate": 3.232464205783943e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7376 }, { "completion_length": 579.5, "epoch": 2.044623059866962, "grad_norm": 0.0, "kl": 0.1879509836435318, "learning_rate": 3.2320456146214946e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7377 }, { "completion_length": 543.75, "epoch": 2.04490022172949, "grad_norm": 0.0, "kl": 0.300462007522583, "learning_rate": 3.2316270010102535e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7378 }, { "completion_length": 501.5, "epoch": 2.045177383592018, "grad_norm": 0.0, "kl": 0.3243182301521301, "learning_rate": 3.231208364963055e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7379 }, { "completion_length": 586.0, "epoch": 2.0454545454545454, "grad_norm": 1.2784637212753296, "kl": 2014695552.0, "learning_rate": 3.2307897064927383e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7380 }, { "completion_length": 550.25, "epoch": 2.0457317073170733, "grad_norm": 0.0, "kl": 414781312.0, "learning_rate": 3.230371025612141e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7381 }, { "completion_length": 524.5, "epoch": 2.046008869179601, "grad_norm": 0.0, "kl": 0.21020513772964478, "learning_rate": 3.2299523223341025e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7382 }, { "completion_length": 586.5, "epoch": 2.0462860310421287, "grad_norm": 0.0, "kl": 0.1837489902973175, "learning_rate": 3.229533596671463e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7383 }, { "completion_length": 507.75, "epoch": 2.046563192904656, "grad_norm": 0.0, "kl": 0.1869412064552307, "learning_rate": 3.2291148486370626e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7384 }, { "completion_length": 612.25, "epoch": 2.046840354767184, "grad_norm": 0.0, "kl": 0.17505142092704773, "learning_rate": 3.228696078243743e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7385 }, { "completion_length": 518.25, "epoch": 2.0471175166297115, "grad_norm": 0.0, "kl": 0.2025497555732727, "learning_rate": 3.228277285504346e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7386 }, { "completion_length": 506.0, "epoch": 2.0473946784922394, "grad_norm": 0.0, "kl": 0.22738862037658691, "learning_rate": 3.2278584704317144e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7387 }, { "completion_length": 595.75, "epoch": 2.0476718403547673, "grad_norm": 0.0, "kl": 0.22068633139133453, "learning_rate": 3.2274396330386915e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7388 }, { "completion_length": 548.75, "epoch": 2.047949002217295, "grad_norm": 0.0, "kl": 0.25445687770843506, "learning_rate": 3.2270207733381208e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7389 }, { "completion_length": 555.5, "epoch": 2.0482261640798227, "grad_norm": 0.0, "kl": 41838.15234375, "learning_rate": 3.2266018913428475e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7390 }, { "completion_length": 538.25, "epoch": 2.04850332594235, "grad_norm": 0.0, "kl": 41614012416.0, "learning_rate": 3.226182987065717e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7391 }, { "completion_length": 556.25, "epoch": 2.048780487804878, "grad_norm": 0.0, "kl": 6.693345069885254, "learning_rate": 3.225764060519574e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7392 }, { "completion_length": 512.25, "epoch": 2.0490576496674056, "grad_norm": 0.0, "kl": 0.22345475852489471, "learning_rate": 3.225345111717267e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7393 }, { "completion_length": 513.5, "epoch": 2.0493348115299335, "grad_norm": 0.0, "kl": 0.16582240164279938, "learning_rate": 3.224926140671643e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7394 }, { "completion_length": 580.75, "epoch": 2.0496119733924614, "grad_norm": 0.0, "kl": 0.18953607976436615, "learning_rate": 3.2245071473955504e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7395 }, { "completion_length": 623.5, "epoch": 2.049889135254989, "grad_norm": 0.0, "kl": 0.1767101138830185, "learning_rate": 3.2240881319018364e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7396 }, { "completion_length": 545.75, "epoch": 2.0501662971175167, "grad_norm": 0.0, "kl": 0.25815096497535706, "learning_rate": 3.2236690942033523e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7397 }, { "completion_length": 466.0, "epoch": 2.050443458980044, "grad_norm": 0.0, "kl": 0.26026320457458496, "learning_rate": 3.223250034312947e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7398 }, { "completion_length": 514.25, "epoch": 2.050720620842572, "grad_norm": 0.0, "kl": 0.16547895967960358, "learning_rate": 3.222830952243472e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7399 }, { "completion_length": 477.5, "epoch": 2.0509977827050996, "grad_norm": 0.0, "kl": 0.2471318542957306, "learning_rate": 3.2224118480077784e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7400 }, { "completion_length": 631.25, "epoch": 2.0512749445676275, "grad_norm": 0.42736175656318665, "kl": 111861702656.0, "learning_rate": 3.2219927216187186e-06, "loss": -0.0, "reward": 4.125, "reward_std": 1.25, "rewards/confident_score_func": 0.875, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7401 }, { "completion_length": 607.25, "epoch": 2.0515521064301554, "grad_norm": 0.0, "kl": 0.1501370668411255, "learning_rate": 3.2215735730891455e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7402 }, { "completion_length": 512.75, "epoch": 2.051829268292683, "grad_norm": 0.0, "kl": 179265248.0, "learning_rate": 3.2211544024319124e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7403 }, { "completion_length": 671.0, "epoch": 2.0521064301552108, "grad_norm": 2.1139533519744873, "kl": 2994277632.0, "learning_rate": 3.2207352096598743e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7404 }, { "completion_length": 517.5, "epoch": 2.0523835920177382, "grad_norm": 0.0, "kl": 0.6914373636245728, "learning_rate": 3.2203159947858853e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7405 }, { "completion_length": 485.25, "epoch": 2.052660753880266, "grad_norm": 0.0, "kl": 0.300820916891098, "learning_rate": 3.2198967578228013e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7406 }, { "completion_length": 686.75, "epoch": 2.0529379157427936, "grad_norm": 0.41633617877960205, "kl": 0.16716156899929047, "learning_rate": 3.2194774987834777e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7407 }, { "completion_length": 569.75, "epoch": 2.0532150776053215, "grad_norm": 2.6069774627685547, "kl": 0.2145470529794693, "learning_rate": 3.219058217680773e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7408 }, { "completion_length": 574.0, "epoch": 2.0534922394678494, "grad_norm": 0.0, "kl": 0.21150881052017212, "learning_rate": 3.218638914527543e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7409 }, { "completion_length": 491.75, "epoch": 2.053769401330377, "grad_norm": 0.0, "kl": 33654187229184.0, "learning_rate": 3.218219589336648e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7410 }, { "completion_length": 414.75, "epoch": 2.054046563192905, "grad_norm": 0.0, "kl": 0.27820470929145813, "learning_rate": 3.2178002421209455e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7411 }, { "completion_length": 558.0, "epoch": 2.0543237250554323, "grad_norm": 0.0, "kl": 0.1657363474369049, "learning_rate": 3.2173808728932953e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7412 }, { "completion_length": 420.75, "epoch": 2.05460088691796, "grad_norm": 0.0, "kl": 0.20706546306610107, "learning_rate": 3.2169614816665584e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7413 }, { "completion_length": 482.75, "epoch": 2.0548780487804876, "grad_norm": 0.0, "kl": 0.2177157998085022, "learning_rate": 3.216542068453595e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7414 }, { "completion_length": 494.75, "epoch": 2.0551552106430155, "grad_norm": 0.0, "kl": 0.2740674316883087, "learning_rate": 3.2161226332672667e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7415 }, { "completion_length": 530.25, "epoch": 2.0554323725055434, "grad_norm": 0.0, "kl": 24606324.0, "learning_rate": 3.215703176120437e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7416 }, { "completion_length": 518.0, "epoch": 2.055709534368071, "grad_norm": 0.4704115092754364, "kl": 0.20976556837558746, "learning_rate": 3.2152836970259683e-06, "loss": -0.0, "reward": 4.625, "reward_std": 2.25, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7417 }, { "completion_length": 577.5, "epoch": 2.055986696230599, "grad_norm": 0.41956833004951477, "kl": 12683362435072.0, "learning_rate": 3.214864195996723e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7418 }, { "completion_length": 508.0, "epoch": 2.0562638580931263, "grad_norm": 0.0, "kl": 10352310272.0, "learning_rate": 3.214444673045568e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7419 }, { "completion_length": 470.5, "epoch": 2.056541019955654, "grad_norm": 0.0, "kl": 728312184832.0, "learning_rate": 3.214025128185366e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7420 }, { "completion_length": 580.0, "epoch": 2.0568181818181817, "grad_norm": 0.36761710047721863, "kl": 0.16263872385025024, "learning_rate": 3.2136055614289843e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7421 }, { "completion_length": 512.75, "epoch": 2.0570953436807096, "grad_norm": 0.0, "kl": 0.18417318165302277, "learning_rate": 3.2131859727892873e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7422 }, { "completion_length": 550.0, "epoch": 2.0573725055432375, "grad_norm": 0.0, "kl": 0.20637446641921997, "learning_rate": 3.2127663622791445e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7423 }, { "completion_length": 538.25, "epoch": 2.057649667405765, "grad_norm": 0.0, "kl": 0.21139270067214966, "learning_rate": 3.2123467299114216e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7424 }, { "completion_length": 589.0, "epoch": 2.057926829268293, "grad_norm": 0.0, "kl": 0.17405171692371368, "learning_rate": 3.2119270756989883e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7425 }, { "completion_length": 519.5, "epoch": 2.0582039911308203, "grad_norm": 0.7328740954399109, "kl": 203790.5625, "learning_rate": 3.211507399654713e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7426 }, { "completion_length": 531.5, "epoch": 2.058481152993348, "grad_norm": 0.0, "kl": 0.23311465978622437, "learning_rate": 3.2110877017914653e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7427 }, { "completion_length": 576.25, "epoch": 2.0587583148558757, "grad_norm": 0.0, "kl": 0.18376211822032928, "learning_rate": 3.2106679821221165e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7428 }, { "completion_length": 539.75, "epoch": 2.0590354767184036, "grad_norm": 0.0, "kl": 0.17561598122119904, "learning_rate": 3.2102482406595357e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7429 }, { "completion_length": 437.25, "epoch": 2.059312638580931, "grad_norm": 0.0, "kl": 0.23096835613250732, "learning_rate": 3.2098284774165967e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7430 }, { "completion_length": 491.25, "epoch": 2.059589800443459, "grad_norm": 0.0, "kl": 0.19087758660316467, "learning_rate": 3.2094086924061717e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7431 }, { "completion_length": 473.5, "epoch": 2.059866962305987, "grad_norm": 0.0, "kl": 0.19635362923145294, "learning_rate": 3.2089888856411323e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7432 }, { "completion_length": 579.25, "epoch": 2.0601441241685143, "grad_norm": 0.0, "kl": 0.1796785295009613, "learning_rate": 3.208569057134353e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7433 }, { "completion_length": 626.75, "epoch": 2.0604212860310422, "grad_norm": 0.0, "kl": 0.1963685154914856, "learning_rate": 3.20814920689871e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7434 }, { "completion_length": 658.0, "epoch": 2.0606984478935697, "grad_norm": 0.0, "kl": 0.1924395114183426, "learning_rate": 3.2077293349470744e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7435 }, { "completion_length": 580.25, "epoch": 2.0609756097560976, "grad_norm": 0.0, "kl": 0.1876855492591858, "learning_rate": 3.207309441292325e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7436 }, { "completion_length": 502.0, "epoch": 2.061252771618625, "grad_norm": 0.0, "kl": 3.819844961166382, "learning_rate": 3.2068895259473375e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7437 }, { "completion_length": 619.25, "epoch": 2.061529933481153, "grad_norm": 0.0, "kl": 0.2980836033821106, "learning_rate": 3.206469588924989e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7438 }, { "completion_length": 527.25, "epoch": 2.061807095343681, "grad_norm": 0.0, "kl": 0.6067603230476379, "learning_rate": 3.206049630238157e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7439 }, { "completion_length": 525.25, "epoch": 2.0620842572062084, "grad_norm": 0.0, "kl": 0.1938093900680542, "learning_rate": 3.2056296498997202e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7440 }, { "completion_length": 604.0, "epoch": 2.0623614190687363, "grad_norm": 0.0, "kl": 0.1639467179775238, "learning_rate": 3.205209647922557e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7441 }, { "completion_length": 562.5, "epoch": 2.0626385809312637, "grad_norm": 0.0, "kl": 0.8463837504386902, "learning_rate": 3.2047896243195477e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7442 }, { "completion_length": 649.25, "epoch": 2.0629157427937916, "grad_norm": 0.0, "kl": 0.16613991558551788, "learning_rate": 3.2043695791035723e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7443 }, { "completion_length": 613.75, "epoch": 2.063192904656319, "grad_norm": 0.0, "kl": 14.940317153930664, "learning_rate": 3.2039495122875113e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7444 }, { "completion_length": 491.75, "epoch": 2.063470066518847, "grad_norm": 0.0, "kl": 0.24084331095218658, "learning_rate": 3.2035294238842478e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7445 }, { "completion_length": 584.75, "epoch": 2.063747228381375, "grad_norm": 1.3713536262512207, "kl": 961359314944.0, "learning_rate": 3.2031093139066637e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7446 }, { "completion_length": 578.0, "epoch": 2.0640243902439024, "grad_norm": 0.0, "kl": 0.1924433708190918, "learning_rate": 3.2026891823676413e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7447 }, { "completion_length": 495.75, "epoch": 2.0643015521064303, "grad_norm": 0.0, "kl": 0.1800786256790161, "learning_rate": 3.202269029280065e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7448 }, { "completion_length": 539.25, "epoch": 2.0645787139689578, "grad_norm": 0.0, "kl": 0.19369420409202576, "learning_rate": 3.2018488546568194e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7449 }, { "completion_length": 510.25, "epoch": 2.0648558758314857, "grad_norm": 0.0, "kl": 0.20876874029636383, "learning_rate": 3.2014286585107883e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7450 }, { "completion_length": 491.0, "epoch": 2.065133037694013, "grad_norm": 0.0, "kl": 0.200459286570549, "learning_rate": 3.2010084408548587e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7451 }, { "completion_length": 647.5, "epoch": 2.065410199556541, "grad_norm": 0.0, "kl": 0.16654245555400848, "learning_rate": 3.2005882017019156e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7452 }, { "completion_length": 535.0, "epoch": 2.065687361419069, "grad_norm": 0.0, "kl": 0.2803683280944824, "learning_rate": 3.200167941064848e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7453 }, { "completion_length": 584.75, "epoch": 2.0659645232815964, "grad_norm": 0.0, "kl": 0.14366453886032104, "learning_rate": 3.199747658956541e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7454 }, { "completion_length": 656.25, "epoch": 2.0662416851441243, "grad_norm": 0.5758587121963501, "kl": 184722309120.0, "learning_rate": 3.1993273553898853e-06, "loss": 0.0, "reward": 2.40625, "reward_std": 2.3214914798736572, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.65625, "step": 7455 }, { "completion_length": 567.75, "epoch": 2.066518847006652, "grad_norm": 0.0, "kl": 0.19247794151306152, "learning_rate": 3.198907030377768e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7456 }, { "completion_length": 586.5, "epoch": 2.0667960088691797, "grad_norm": 0.0, "kl": 0.1774129569530487, "learning_rate": 3.1984866839330797e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7457 }, { "completion_length": 631.75, "epoch": 2.067073170731707, "grad_norm": 0.0, "kl": 0.1716151237487793, "learning_rate": 3.19806631606871e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7458 }, { "completion_length": 585.75, "epoch": 2.067350332594235, "grad_norm": 0.0, "kl": 0.20400017499923706, "learning_rate": 3.1976459267975506e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7459 }, { "completion_length": 629.0, "epoch": 2.0676274944567625, "grad_norm": 0.0, "kl": 0.31868264079093933, "learning_rate": 3.1972255161324928e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7460 }, { "completion_length": 554.25, "epoch": 2.0679046563192904, "grad_norm": 0.0, "kl": 0.22610323131084442, "learning_rate": 3.196805084086429e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7461 }, { "completion_length": 567.75, "epoch": 2.0681818181818183, "grad_norm": 0.0, "kl": 0.1811879575252533, "learning_rate": 3.196384630672251e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7462 }, { "completion_length": 564.5, "epoch": 2.068458980044346, "grad_norm": 0.0, "kl": 0.19144076108932495, "learning_rate": 3.1959641559028546e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7463 }, { "completion_length": 481.5, "epoch": 2.0687361419068737, "grad_norm": 0.0, "kl": 49015185997824.0, "learning_rate": 3.195543659791132e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7464 }, { "completion_length": 537.25, "epoch": 2.069013303769401, "grad_norm": 1.227454662322998, "kl": 5410122752.0, "learning_rate": 3.1951231423499783e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7465 }, { "completion_length": 498.75, "epoch": 2.069290465631929, "grad_norm": 0.0, "kl": 0.23499499261379242, "learning_rate": 3.1947026035922902e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7466 }, { "completion_length": 576.5, "epoch": 2.0695676274944566, "grad_norm": 0.0, "kl": 0.28637930750846863, "learning_rate": 3.194282043530963e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7467 }, { "completion_length": 585.75, "epoch": 2.0698447893569845, "grad_norm": 0.0, "kl": 0.34612470865249634, "learning_rate": 3.193861462178893e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7468 }, { "completion_length": 499.0, "epoch": 2.0701219512195124, "grad_norm": 0.0, "kl": 0.20715992152690887, "learning_rate": 3.1934408595489793e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7469 }, { "completion_length": 467.0, "epoch": 2.07039911308204, "grad_norm": 0.0, "kl": 0.22957786917686462, "learning_rate": 3.193020235654119e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7470 }, { "completion_length": 656.0, "epoch": 2.0706762749445677, "grad_norm": 0.0, "kl": 0.18439462780952454, "learning_rate": 3.1925995905072107e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7471 }, { "completion_length": 567.0, "epoch": 2.070953436807095, "grad_norm": 0.0, "kl": 0.19015318155288696, "learning_rate": 3.1921789241211544e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7472 }, { "completion_length": 605.25, "epoch": 2.071230598669623, "grad_norm": 0.0, "kl": 0.1926862597465515, "learning_rate": 3.191758236508849e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7473 }, { "completion_length": 649.75, "epoch": 2.0715077605321506, "grad_norm": 0.0, "kl": 0.379720002412796, "learning_rate": 3.1913375276831966e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7474 }, { "completion_length": 640.75, "epoch": 2.0717849223946785, "grad_norm": 0.0, "kl": 0.23775620758533478, "learning_rate": 3.1909167976570977e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7475 }, { "completion_length": 627.0, "epoch": 2.0720620842572064, "grad_norm": 0.0, "kl": 0.15102680027484894, "learning_rate": 3.190496046443456e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7476 }, { "completion_length": 703.0, "epoch": 2.072339246119734, "grad_norm": 0.0, "kl": 0.16029611229896545, "learning_rate": 3.1900752740551715e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7477 }, { "completion_length": 595.75, "epoch": 2.0726164079822618, "grad_norm": 0.0, "kl": 0.2154289186000824, "learning_rate": 3.1896544805051503e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7478 }, { "completion_length": 645.25, "epoch": 2.0728935698447892, "grad_norm": 0.0, "kl": 335463415808.0, "learning_rate": 3.189233665806294e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7479 }, { "completion_length": 544.5, "epoch": 2.073170731707317, "grad_norm": 0.0, "kl": 0.16641397774219513, "learning_rate": 3.188812829971509e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7480 }, { "completion_length": 541.5, "epoch": 2.0734478935698446, "grad_norm": 0.0, "kl": 0.2402142733335495, "learning_rate": 3.1883919730136993e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7481 }, { "completion_length": 506.5, "epoch": 2.0737250554323725, "grad_norm": 0.0, "kl": 0.28119784593582153, "learning_rate": 3.187971094945772e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7482 }, { "completion_length": 447.0, "epoch": 2.0740022172949004, "grad_norm": 0.0, "kl": 0.23267178237438202, "learning_rate": 3.187550195780633e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7483 }, { "completion_length": 629.5, "epoch": 2.074279379157428, "grad_norm": 0.0, "kl": 0.18451207876205444, "learning_rate": 3.1871292755311887e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7484 }, { "completion_length": 521.25, "epoch": 2.074556541019956, "grad_norm": 0.0, "kl": 0.4273624122142792, "learning_rate": 3.1867083342103495e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7485 }, { "completion_length": 530.5, "epoch": 2.0748337028824833, "grad_norm": 0.0, "kl": 0.2102336585521698, "learning_rate": 3.186287371831022e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7486 }, { "completion_length": 665.25, "epoch": 2.075110864745011, "grad_norm": 0.0, "kl": 0.19421157240867615, "learning_rate": 3.185866388406115e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7487 }, { "completion_length": 567.0, "epoch": 2.0753880266075386, "grad_norm": 0.0, "kl": 0.290271520614624, "learning_rate": 3.185445383948539e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7488 }, { "completion_length": 549.25, "epoch": 2.0756651884700665, "grad_norm": 0.0, "kl": 0.21324706077575684, "learning_rate": 3.1850243584712045e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7489 }, { "completion_length": 562.25, "epoch": 2.0759423503325944, "grad_norm": 0.0, "kl": 0.24313044548034668, "learning_rate": 3.1846033119870223e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7490 }, { "completion_length": 696.75, "epoch": 2.076219512195122, "grad_norm": 0.0, "kl": 0.18159784376621246, "learning_rate": 3.1841822445089053e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7491 }, { "completion_length": 577.0, "epoch": 2.07649667405765, "grad_norm": 0.0, "kl": 0.2206469476222992, "learning_rate": 3.1837611560497645e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7492 }, { "completion_length": 607.0, "epoch": 2.0767738359201773, "grad_norm": 0.0, "kl": 0.2809286415576935, "learning_rate": 3.183340046622514e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7493 }, { "completion_length": 573.5, "epoch": 2.077050997782705, "grad_norm": 0.0, "kl": 35534385152.0, "learning_rate": 3.1829189162400657e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7494 }, { "completion_length": 576.5, "epoch": 2.0773281596452327, "grad_norm": 0.0, "kl": 0.17375332117080688, "learning_rate": 3.182497764915336e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7495 }, { "completion_length": 667.5, "epoch": 2.0776053215077606, "grad_norm": 0.0, "kl": 10249162457088.0, "learning_rate": 3.1820765926612386e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7496 }, { "completion_length": 468.75, "epoch": 2.0778824833702885, "grad_norm": 0.0, "kl": 0.2160649597644806, "learning_rate": 3.1816553994906896e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7497 }, { "completion_length": 674.0, "epoch": 2.078159645232816, "grad_norm": 5.506011009216309, "kl": 0.21655914187431335, "learning_rate": 3.181234185416605e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7498 }, { "completion_length": 712.25, "epoch": 2.078436807095344, "grad_norm": 0.0, "kl": 0.3166603446006775, "learning_rate": 3.180812950451902e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7499 }, { "completion_length": 729.5, "epoch": 2.0787139689578713, "grad_norm": 0.0, "kl": 0.15693847835063934, "learning_rate": 3.180391694609498e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7500 }, { "completion_length": 599.0, "epoch": 2.078991130820399, "grad_norm": 0.0, "kl": 0.15752039849758148, "learning_rate": 3.1799704179023105e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7501 }, { "completion_length": 565.25, "epoch": 2.0792682926829267, "grad_norm": 0.0, "kl": 0.22867292165756226, "learning_rate": 3.1795491203432598e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7502 }, { "completion_length": 576.5, "epoch": 2.0795454545454546, "grad_norm": 0.0, "kl": 0.21455872058868408, "learning_rate": 3.179127801945264e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7503 }, { "completion_length": 543.0, "epoch": 2.079822616407982, "grad_norm": 0.0, "kl": 0.24000369012355804, "learning_rate": 3.178706462721243e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7504 }, { "completion_length": 447.5, "epoch": 2.08009977827051, "grad_norm": 0.0, "kl": 0.24724113941192627, "learning_rate": 3.178285102684119e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7505 }, { "completion_length": 597.0, "epoch": 2.080376940133038, "grad_norm": 0.0, "kl": 0.2116355299949646, "learning_rate": 3.1778637218468124e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7506 }, { "completion_length": 475.5, "epoch": 2.0806541019955653, "grad_norm": 0.0, "kl": 0.2974589169025421, "learning_rate": 3.177442320222245e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7507 }, { "completion_length": 566.0, "epoch": 2.0809312638580932, "grad_norm": 0.0, "kl": 0.2234015166759491, "learning_rate": 3.17702089782334e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7508 }, { "completion_length": 660.75, "epoch": 2.0812084257206207, "grad_norm": 0.0, "kl": 0.19138789176940918, "learning_rate": 3.17659945466302e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7509 }, { "completion_length": 596.75, "epoch": 2.0814855875831486, "grad_norm": 0.3731689155101776, "kl": 0.19771240651607513, "learning_rate": 3.17617799075421e-06, "loss": -0.0, "reward": 5.59375, "reward_std": 0.3125, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 7510 }, { "completion_length": 615.25, "epoch": 2.081762749445676, "grad_norm": 0.0, "kl": 0.24003590643405914, "learning_rate": 3.1757565061098328e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7511 }, { "completion_length": 529.5, "epoch": 2.082039911308204, "grad_norm": 0.0, "kl": 0.20724372565746307, "learning_rate": 3.175335000742815e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7512 }, { "completion_length": 532.0, "epoch": 2.082317073170732, "grad_norm": 0.0, "kl": 0.6817724704742432, "learning_rate": 3.1749134746660824e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7513 }, { "completion_length": 621.5, "epoch": 2.0825942350332594, "grad_norm": 0.0, "kl": 0.18294544517993927, "learning_rate": 3.174491927892561e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7514 }, { "completion_length": 823.5, "epoch": 2.0828713968957873, "grad_norm": 0.0, "kl": 34960912154624.0, "learning_rate": 3.174070360435178e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7515 }, { "completion_length": 533.5, "epoch": 2.0831485587583147, "grad_norm": 0.0, "kl": 0.17585600912570953, "learning_rate": 3.173648772306861e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7516 }, { "completion_length": 591.5, "epoch": 2.0834257206208426, "grad_norm": 0.0, "kl": 0.15404915809631348, "learning_rate": 3.1732271635205394e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7517 }, { "completion_length": 540.0, "epoch": 2.08370288248337, "grad_norm": 0.0, "kl": 0.3242274820804596, "learning_rate": 3.1728055340891395e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7518 }, { "completion_length": 609.25, "epoch": 2.083980044345898, "grad_norm": 0.0, "kl": 0.16419865190982819, "learning_rate": 3.172383884025594e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7519 }, { "completion_length": 554.25, "epoch": 2.084257206208426, "grad_norm": 0.0, "kl": 0.183963343501091, "learning_rate": 3.1719622133428306e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7520 }, { "completion_length": 469.0, "epoch": 2.0845343680709534, "grad_norm": 3.8250887393951416, "kl": 0.4207191467285156, "learning_rate": 3.1715405220537826e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7521 }, { "completion_length": 521.25, "epoch": 2.0848115299334813, "grad_norm": 0.0, "kl": 0.513599157333374, "learning_rate": 3.1711188101713796e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7522 }, { "completion_length": 676.0, "epoch": 2.0850886917960088, "grad_norm": 0.0, "kl": 0.15842682123184204, "learning_rate": 3.1706970777085556e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7523 }, { "completion_length": 531.25, "epoch": 2.0853658536585367, "grad_norm": 0.0, "kl": 0.14650624990463257, "learning_rate": 3.1702753246782407e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7524 }, { "completion_length": 583.5, "epoch": 2.085643015521064, "grad_norm": 0.0, "kl": 0.1674233376979828, "learning_rate": 3.1698535510933715e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7525 }, { "completion_length": 608.25, "epoch": 2.085920177383592, "grad_norm": 0.0, "kl": 0.35576552152633667, "learning_rate": 3.1694317569668785e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7526 }, { "completion_length": 580.25, "epoch": 2.08619733924612, "grad_norm": 0.0, "kl": 0.1855221688747406, "learning_rate": 3.1690099423117e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7527 }, { "completion_length": 574.5, "epoch": 2.0864745011086474, "grad_norm": 0.0, "kl": 0.1729222536087036, "learning_rate": 3.1685881071407686e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7528 }, { "completion_length": 687.25, "epoch": 2.0867516629711753, "grad_norm": 0.0, "kl": 0.26031509041786194, "learning_rate": 3.168166251467022e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7529 }, { "completion_length": 528.0, "epoch": 2.087028824833703, "grad_norm": 0.0, "kl": 0.21586771309375763, "learning_rate": 3.167744375303395e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7530 }, { "completion_length": 703.5, "epoch": 2.0873059866962307, "grad_norm": 0.0, "kl": 0.17804566025733948, "learning_rate": 3.167322478662826e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7531 }, { "completion_length": 472.75, "epoch": 2.087583148558758, "grad_norm": 0.0, "kl": 0.202853724360466, "learning_rate": 3.166900561558253e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7532 }, { "completion_length": 702.75, "epoch": 2.087860310421286, "grad_norm": 0.0, "kl": 0.3148515224456787, "learning_rate": 3.1664786240026136e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7533 }, { "completion_length": 723.75, "epoch": 2.0881374722838135, "grad_norm": 0.0, "kl": 0.1434350609779358, "learning_rate": 3.1660566660088475e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7534 }, { "completion_length": 459.0, "epoch": 2.0884146341463414, "grad_norm": 0.0, "kl": 0.27961188554763794, "learning_rate": 3.165634687589894e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7535 }, { "completion_length": 637.25, "epoch": 2.0886917960088693, "grad_norm": 0.0, "kl": 0.16676220297813416, "learning_rate": 3.1652126887586943e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7536 }, { "completion_length": 516.25, "epoch": 2.088968957871397, "grad_norm": 0.0, "kl": 0.18925239145755768, "learning_rate": 3.164790669528188e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7537 }, { "completion_length": 594.5, "epoch": 2.0892461197339247, "grad_norm": 0.0, "kl": 0.21181105077266693, "learning_rate": 3.1643686299113174e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7538 }, { "completion_length": 705.0, "epoch": 2.089523281596452, "grad_norm": 0.0, "kl": 0.13102421164512634, "learning_rate": 3.163946569921025e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7539 }, { "completion_length": 919.5, "epoch": 2.08980044345898, "grad_norm": 0.23748905956745148, "kl": 0.14637939631938934, "learning_rate": 3.1635244895702527e-06, "loss": -0.0, "reward": 1.09375, "reward_std": 1.3125, "rewards/confident_score_func": -0.25, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.59375, "step": 7540 }, { "completion_length": 569.5, "epoch": 2.0900776053215075, "grad_norm": 0.0, "kl": 0.18078063428401947, "learning_rate": 3.1631023888719442e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7541 }, { "completion_length": 623.25, "epoch": 2.0903547671840355, "grad_norm": 0.0, "kl": 0.1606004238128662, "learning_rate": 3.1626802678390445e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7542 }, { "completion_length": 574.5, "epoch": 2.0906319290465634, "grad_norm": 0.0, "kl": 0.21134918928146362, "learning_rate": 3.1622581264844975e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7543 }, { "completion_length": 634.75, "epoch": 2.090909090909091, "grad_norm": 0.0, "kl": 287789.4375, "learning_rate": 3.1618359648212492e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7544 }, { "completion_length": 620.75, "epoch": 2.0911862527716187, "grad_norm": 0.3524526357650757, "kl": 0.18322274088859558, "learning_rate": 3.161413782862244e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7545 }, { "completion_length": 615.0, "epoch": 2.091463414634146, "grad_norm": 0.0, "kl": 0.21753862500190735, "learning_rate": 3.1609915806204307e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7546 }, { "completion_length": 507.25, "epoch": 2.091740576496674, "grad_norm": 0.0, "kl": 0.1841832548379898, "learning_rate": 3.1605693581087547e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7547 }, { "completion_length": 570.75, "epoch": 2.0920177383592016, "grad_norm": 0.0, "kl": 0.2046549916267395, "learning_rate": 3.160147115340164e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7548 }, { "completion_length": 702.0, "epoch": 2.0922949002217295, "grad_norm": 0.0, "kl": 0.13875384628772736, "learning_rate": 3.1597248523276073e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7549 }, { "completion_length": 540.25, "epoch": 2.0925720620842574, "grad_norm": 0.0, "kl": 460204.90625, "learning_rate": 3.1593025690840344e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7550 }, { "completion_length": 569.25, "epoch": 2.092849223946785, "grad_norm": 0.0, "kl": 0.16322070360183716, "learning_rate": 3.1588802656223946e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7551 }, { "completion_length": 537.5, "epoch": 2.0931263858093128, "grad_norm": 0.0, "kl": 0.193756103515625, "learning_rate": 3.158457941955637e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7552 }, { "completion_length": 589.75, "epoch": 2.0934035476718402, "grad_norm": 0.3642043471336365, "kl": 0.14855718612670898, "learning_rate": 3.158035598096715e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7553 }, { "completion_length": 629.0, "epoch": 2.093680709534368, "grad_norm": 0.0, "kl": 0.19822899997234344, "learning_rate": 3.157613234058577e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7554 }, { "completion_length": 522.0, "epoch": 2.0939578713968956, "grad_norm": 0.0, "kl": 0.227504163980484, "learning_rate": 3.1571908498541774e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7555 }, { "completion_length": 530.25, "epoch": 2.0942350332594235, "grad_norm": 0.0, "kl": 0.28620219230651855, "learning_rate": 3.1567684454964674e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7556 }, { "completion_length": 664.0, "epoch": 2.0945121951219514, "grad_norm": 0.0, "kl": 0.14831764996051788, "learning_rate": 3.1563460209984022e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7557 }, { "completion_length": 556.75, "epoch": 2.094789356984479, "grad_norm": 0.0, "kl": 0.2271127998828888, "learning_rate": 3.1559235763729347e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7558 }, { "completion_length": 527.0, "epoch": 2.095066518847007, "grad_norm": 0.0, "kl": 0.18256112933158875, "learning_rate": 3.15550111163302e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7559 }, { "completion_length": 535.25, "epoch": 2.0953436807095343, "grad_norm": 0.0, "kl": 0.17515507340431213, "learning_rate": 3.1550786267916123e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7560 }, { "completion_length": 572.25, "epoch": 2.095620842572062, "grad_norm": 0.0, "kl": 0.18339386582374573, "learning_rate": 3.1546561218616684e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7561 }, { "completion_length": 570.25, "epoch": 2.0958980044345896, "grad_norm": 0.9273380041122437, "kl": 2107813.25, "learning_rate": 3.154233596856145e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7562 }, { "completion_length": 534.5, "epoch": 2.0961751662971175, "grad_norm": 0.0, "kl": 0.1880822777748108, "learning_rate": 3.1538110517879983e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7563 }, { "completion_length": 574.75, "epoch": 2.0964523281596454, "grad_norm": 0.0, "kl": 0.1946103870868683, "learning_rate": 3.1533884866701857e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7564 }, { "completion_length": 703.0, "epoch": 2.096729490022173, "grad_norm": 0.0, "kl": 0.17416797578334808, "learning_rate": 3.1529659015156673e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7565 }, { "completion_length": 580.0, "epoch": 2.097006651884701, "grad_norm": 0.34957563877105713, "kl": 0.16573074460029602, "learning_rate": 3.152543296337401e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7566 }, { "completion_length": 572.25, "epoch": 2.0972838137472283, "grad_norm": 0.0, "kl": 0.19289138913154602, "learning_rate": 3.152120671148346e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7567 }, { "completion_length": 601.25, "epoch": 2.097560975609756, "grad_norm": 0.0, "kl": 0.2690831422805786, "learning_rate": 3.1516980259614626e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7568 }, { "completion_length": 661.0, "epoch": 2.0978381374722836, "grad_norm": 0.0, "kl": 0.1498289853334427, "learning_rate": 3.1512753607897123e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7569 }, { "completion_length": 565.25, "epoch": 2.0981152993348116, "grad_norm": 0.0, "kl": 0.16425910592079163, "learning_rate": 3.1508526756460555e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7570 }, { "completion_length": 543.25, "epoch": 2.0983924611973395, "grad_norm": 0.0, "kl": 0.17535674571990967, "learning_rate": 3.1504299705434542e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7571 }, { "completion_length": 625.75, "epoch": 2.098669623059867, "grad_norm": 0.42266035079956055, "kl": 1070857977856.0, "learning_rate": 3.150007245494872e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7572 }, { "completion_length": 579.75, "epoch": 2.098946784922395, "grad_norm": 0.0, "kl": 0.20415213704109192, "learning_rate": 3.149584500513272e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7573 }, { "completion_length": 557.5, "epoch": 2.0992239467849223, "grad_norm": 0.0, "kl": 0.1983793079853058, "learning_rate": 3.1491617356116167e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7574 }, { "completion_length": 612.5, "epoch": 2.09950110864745, "grad_norm": 0.39566782116889954, "kl": 3301216354304.0, "learning_rate": 3.148738950802872e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7575 }, { "completion_length": 631.25, "epoch": 2.0997782705099777, "grad_norm": 0.0, "kl": 0.21488520503044128, "learning_rate": 3.1483161461000023e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7576 }, { "completion_length": 539.75, "epoch": 2.1000554323725056, "grad_norm": 0.0, "kl": 0.18850292265415192, "learning_rate": 3.147893321515973e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7577 }, { "completion_length": 690.75, "epoch": 2.100332594235033, "grad_norm": 0.32166168093681335, "kl": 0.15310274064540863, "learning_rate": 3.1474704770637505e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7578 }, { "completion_length": 605.75, "epoch": 2.100609756097561, "grad_norm": 0.0, "kl": 0.1969272643327713, "learning_rate": 3.147047612756302e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7579 }, { "completion_length": 578.25, "epoch": 2.100886917960089, "grad_norm": 0.0, "kl": 0.1626117080450058, "learning_rate": 3.1466247286065954e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7580 }, { "completion_length": 580.5, "epoch": 2.1011640798226163, "grad_norm": 0.0, "kl": 0.19171766936779022, "learning_rate": 3.1462018246275975e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7581 }, { "completion_length": 579.25, "epoch": 2.1014412416851442, "grad_norm": 0.0, "kl": 0.17568261921405792, "learning_rate": 3.145778900832278e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7582 }, { "completion_length": 499.25, "epoch": 2.1017184035476717, "grad_norm": 0.0, "kl": 0.4077596664428711, "learning_rate": 3.145355957233607e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7583 }, { "completion_length": 543.75, "epoch": 2.1019955654101996, "grad_norm": 0.0, "kl": 0.164290651679039, "learning_rate": 3.1449329938445527e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7584 }, { "completion_length": 457.0, "epoch": 2.102272727272727, "grad_norm": 0.40514662861824036, "kl": 0.19345611333847046, "learning_rate": 3.144510010678086e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7585 }, { "completion_length": 606.25, "epoch": 2.102549889135255, "grad_norm": 0.0, "kl": 0.24498289823532104, "learning_rate": 3.144087007747178e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7586 }, { "completion_length": 599.25, "epoch": 2.102827050997783, "grad_norm": 1.5516037940979004, "kl": 12092607299584.0, "learning_rate": 3.143663985064802e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7587 }, { "completion_length": 711.0, "epoch": 2.1031042128603104, "grad_norm": 0.0, "kl": 0.17056524753570557, "learning_rate": 3.143240942643928e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7588 }, { "completion_length": 571.25, "epoch": 2.1033813747228383, "grad_norm": 0.0, "kl": 0.14755572378635406, "learning_rate": 3.1428178804975313e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7589 }, { "completion_length": 598.0, "epoch": 2.1036585365853657, "grad_norm": 0.0, "kl": 0.16917765140533447, "learning_rate": 3.142394798638584e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7590 }, { "completion_length": 636.25, "epoch": 2.1039356984478936, "grad_norm": 0.0, "kl": 0.1909080445766449, "learning_rate": 3.14197169708006e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7591 }, { "completion_length": 546.75, "epoch": 2.104212860310421, "grad_norm": 0.0, "kl": 0.19034472107887268, "learning_rate": 3.1415485758349344e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7592 }, { "completion_length": 550.25, "epoch": 2.104490022172949, "grad_norm": 0.0, "kl": 0.33661913871765137, "learning_rate": 3.141125434916183e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7593 }, { "completion_length": 517.25, "epoch": 2.104767184035477, "grad_norm": 0.0, "kl": 0.16230368614196777, "learning_rate": 3.1407022743367814e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7594 }, { "completion_length": 659.0, "epoch": 2.1050443458980044, "grad_norm": 0.0, "kl": 0.20957022905349731, "learning_rate": 3.1402790941097073e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7595 }, { "completion_length": 567.5, "epoch": 2.1053215077605323, "grad_norm": 0.0, "kl": 0.1694435179233551, "learning_rate": 3.1398558942479355e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7596 }, { "completion_length": 651.5, "epoch": 2.1055986696230597, "grad_norm": 0.0, "kl": 0.16890136897563934, "learning_rate": 3.1394326747644457e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7597 }, { "completion_length": 620.0, "epoch": 2.1058758314855877, "grad_norm": 0.0, "kl": 0.22376979887485504, "learning_rate": 3.1390094356722163e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7598 }, { "completion_length": 747.75, "epoch": 2.106152993348115, "grad_norm": 0.0, "kl": 0.13391955196857452, "learning_rate": 3.1385861769842253e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7599 }, { "completion_length": 598.0, "epoch": 2.106430155210643, "grad_norm": 0.0, "kl": 0.16762153804302216, "learning_rate": 3.1381628987134526e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7600 }, { "completion_length": 611.0, "epoch": 2.106707317073171, "grad_norm": 0.0, "kl": 0.1331876516342163, "learning_rate": 3.1377396008728777e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7601 }, { "completion_length": 561.75, "epoch": 2.1069844789356984, "grad_norm": 0.0, "kl": 0.15507574379444122, "learning_rate": 3.1373162834754835e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7602 }, { "completion_length": 475.0, "epoch": 2.1072616407982263, "grad_norm": 0.0, "kl": 19732849754112.0, "learning_rate": 3.136892946534249e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7603 }, { "completion_length": 633.25, "epoch": 2.1075388026607538, "grad_norm": 0.0, "kl": 1.5673338174819946, "learning_rate": 3.136469590062158e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7604 }, { "completion_length": 722.75, "epoch": 2.1078159645232817, "grad_norm": 0.0, "kl": 0.13044703006744385, "learning_rate": 3.1360462140721925e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7605 }, { "completion_length": 590.0, "epoch": 2.108093126385809, "grad_norm": 0.0, "kl": 0.17719994485378265, "learning_rate": 3.135622818577335e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7606 }, { "completion_length": 523.5, "epoch": 2.108370288248337, "grad_norm": 0.0, "kl": 0.18091873824596405, "learning_rate": 3.1351994035905697e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7607 }, { "completion_length": 543.75, "epoch": 2.1086474501108645, "grad_norm": 0.0, "kl": 0.17319104075431824, "learning_rate": 3.1347759691248815e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7608 }, { "completion_length": 590.5, "epoch": 2.1089246119733924, "grad_norm": 0.0, "kl": 0.2269456386566162, "learning_rate": 3.134352515193254e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7609 }, { "completion_length": 588.75, "epoch": 2.1092017738359203, "grad_norm": 0.0, "kl": 0.20807798206806183, "learning_rate": 3.1339290418086755e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7610 }, { "completion_length": 553.75, "epoch": 2.109478935698448, "grad_norm": 0.0, "kl": 95696560586752.0, "learning_rate": 3.1335055489841286e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7611 }, { "completion_length": 591.75, "epoch": 2.1097560975609757, "grad_norm": 0.0, "kl": 0.5178698897361755, "learning_rate": 3.133082036732602e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7612 }, { "completion_length": 511.75, "epoch": 2.110033259423503, "grad_norm": 0.0, "kl": 0.24469280242919922, "learning_rate": 3.1326585050670844e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7613 }, { "completion_length": 636.25, "epoch": 2.110310421286031, "grad_norm": 0.0, "kl": 0.351599782705307, "learning_rate": 3.132234954000561e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7614 }, { "completion_length": 557.5, "epoch": 2.1105875831485585, "grad_norm": 0.0, "kl": 0.42255741357803345, "learning_rate": 3.1318113835460217e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7615 }, { "completion_length": 482.5, "epoch": 2.1108647450110865, "grad_norm": 0.0, "kl": 3794132008960.0, "learning_rate": 3.131387793716455e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7616 }, { "completion_length": 628.75, "epoch": 2.1111419068736144, "grad_norm": 0.0, "kl": 0.41084811091423035, "learning_rate": 3.1309641845248524e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7617 }, { "completion_length": 604.0, "epoch": 2.111419068736142, "grad_norm": 0.0, "kl": 185607213023232.0, "learning_rate": 3.1305405559842016e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7618 }, { "completion_length": 569.75, "epoch": 2.1116962305986697, "grad_norm": 0.0, "kl": 0.17059622704982758, "learning_rate": 3.1301169081074968e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7619 }, { "completion_length": 568.25, "epoch": 2.111973392461197, "grad_norm": 0.0, "kl": 0.20120786130428314, "learning_rate": 3.129693240907726e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7620 }, { "completion_length": 535.5, "epoch": 2.112250554323725, "grad_norm": 0.0, "kl": 0.18625497817993164, "learning_rate": 3.129269554397884e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7621 }, { "completion_length": 594.25, "epoch": 2.1125277161862526, "grad_norm": 0.0, "kl": 0.6054950952529907, "learning_rate": 3.1288458485909616e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7622 }, { "completion_length": 559.0, "epoch": 2.1128048780487805, "grad_norm": 0.0, "kl": 0.32010889053344727, "learning_rate": 3.1284221234999534e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7623 }, { "completion_length": 668.0, "epoch": 2.1130820399113084, "grad_norm": 0.0, "kl": 0.5691983699798584, "learning_rate": 3.1279983791378526e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7624 }, { "completion_length": 463.5, "epoch": 2.113359201773836, "grad_norm": 0.0, "kl": 0.16869230568408966, "learning_rate": 3.1275746155176547e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7625 }, { "completion_length": 647.75, "epoch": 2.1136363636363638, "grad_norm": 0.0, "kl": 0.2205493450164795, "learning_rate": 3.1271508326523533e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7626 }, { "completion_length": 568.75, "epoch": 2.113913525498891, "grad_norm": 0.0, "kl": 0.17259085178375244, "learning_rate": 3.126727030554945e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7627 }, { "completion_length": 577.75, "epoch": 2.114190687361419, "grad_norm": 0.0, "kl": 0.1736401915550232, "learning_rate": 3.1263032092384255e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7628 }, { "completion_length": 504.0, "epoch": 2.1144678492239466, "grad_norm": 0.0, "kl": 0.21589769423007965, "learning_rate": 3.1258793687157917e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7629 }, { "completion_length": 505.25, "epoch": 2.1147450110864745, "grad_norm": 0.0, "kl": 0.20097050070762634, "learning_rate": 3.1254555090000418e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7630 }, { "completion_length": 614.25, "epoch": 2.1150221729490024, "grad_norm": 0.0, "kl": 0.18314364552497864, "learning_rate": 3.1250316301041727e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7631 }, { "completion_length": 585.5, "epoch": 2.11529933481153, "grad_norm": 0.0, "kl": 0.2149858921766281, "learning_rate": 3.1246077320411842e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7632 }, { "completion_length": 539.75, "epoch": 2.115576496674058, "grad_norm": 0.0, "kl": 0.20004145801067352, "learning_rate": 3.124183814824075e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7633 }, { "completion_length": 489.5, "epoch": 2.1158536585365852, "grad_norm": 0.0, "kl": 0.18712009489536285, "learning_rate": 3.1237598784658444e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7634 }, { "completion_length": 536.5, "epoch": 2.116130820399113, "grad_norm": 0.0, "kl": 0.24671469628810883, "learning_rate": 3.123335922979493e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7635 }, { "completion_length": 694.5, "epoch": 2.1164079822616406, "grad_norm": 0.0, "kl": 1068.0550537109375, "learning_rate": 3.1229119483780223e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7636 }, { "completion_length": 612.5, "epoch": 2.1166851441241685, "grad_norm": 0.0, "kl": 0.252939373254776, "learning_rate": 3.122487954674433e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7637 }, { "completion_length": 543.75, "epoch": 2.1169623059866964, "grad_norm": 0.0, "kl": 0.44926387071609497, "learning_rate": 3.122063941881727e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7638 }, { "completion_length": 569.75, "epoch": 2.117239467849224, "grad_norm": 0.0, "kl": 0.5294837355613708, "learning_rate": 3.1216399100129084e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7639 }, { "completion_length": 515.0, "epoch": 2.117516629711752, "grad_norm": 0.0, "kl": 0.2007139027118683, "learning_rate": 3.12121585908098e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7640 }, { "completion_length": 511.75, "epoch": 2.1177937915742793, "grad_norm": 0.0, "kl": 0.20386868715286255, "learning_rate": 3.120791789098945e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7641 }, { "completion_length": 688.0, "epoch": 2.118070953436807, "grad_norm": 0.0, "kl": 0.1782670021057129, "learning_rate": 3.1203677000798082e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7642 }, { "completion_length": 527.0, "epoch": 2.1183481152993346, "grad_norm": 0.8010120391845703, "kl": 11187905363968.0, "learning_rate": 3.119943592036575e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7643 }, { "completion_length": 575.0, "epoch": 2.1186252771618626, "grad_norm": 0.0, "kl": 0.20850616693496704, "learning_rate": 3.11951946498225e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7644 }, { "completion_length": 587.0, "epoch": 2.1189024390243905, "grad_norm": 0.0, "kl": 0.7914543747901917, "learning_rate": 3.1190953189298407e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7645 }, { "completion_length": 650.75, "epoch": 2.119179600886918, "grad_norm": 0.0, "kl": 0.2178603708744049, "learning_rate": 3.1186711538923536e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7646 }, { "completion_length": 518.25, "epoch": 2.119456762749446, "grad_norm": 0.0, "kl": 739289096781824.0, "learning_rate": 3.1182469698827948e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7647 }, { "completion_length": 427.75, "epoch": 2.1197339246119733, "grad_norm": 0.0, "kl": 0.16993148624897003, "learning_rate": 3.117822766914174e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7648 }, { "completion_length": 545.25, "epoch": 2.120011086474501, "grad_norm": 0.0, "kl": 0.2923688292503357, "learning_rate": 3.117398544999499e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7649 }, { "completion_length": 577.0, "epoch": 2.1202882483370287, "grad_norm": 0.0, "kl": 0.16976328194141388, "learning_rate": 3.1169743041517793e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7650 }, { "completion_length": 562.25, "epoch": 2.1205654101995566, "grad_norm": 0.0, "kl": 0.1414722055196762, "learning_rate": 3.116550044384024e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7651 }, { "completion_length": 554.5, "epoch": 2.1208425720620845, "grad_norm": 0.0, "kl": 1.4868640899658203, "learning_rate": 3.1161257657092434e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7652 }, { "completion_length": 562.25, "epoch": 2.121119733924612, "grad_norm": 2.184762477874756, "kl": 48103124992.0, "learning_rate": 3.1157014681404486e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7653 }, { "completion_length": 567.0, "epoch": 2.12139689578714, "grad_norm": 0.0, "kl": 0.20195095241069794, "learning_rate": 3.115277151690651e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7654 }, { "completion_length": 642.75, "epoch": 2.1216740576496673, "grad_norm": 0.7443140745162964, "kl": 20061683187712.0, "learning_rate": 3.114852816372863e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7655 }, { "completion_length": 599.25, "epoch": 2.1219512195121952, "grad_norm": 0.0, "kl": 2942150.75, "learning_rate": 3.1144284622000966e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7656 }, { "completion_length": 569.25, "epoch": 2.1222283813747227, "grad_norm": 0.0, "kl": 0.263386070728302, "learning_rate": 3.114004089185365e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7657 }, { "completion_length": 549.0, "epoch": 2.1225055432372506, "grad_norm": 0.0, "kl": 134564043292672.0, "learning_rate": 3.1135796973416826e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7658 }, { "completion_length": 521.75, "epoch": 2.122782705099778, "grad_norm": 0.0, "kl": 0.16280622780323029, "learning_rate": 3.113155286682063e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7659 }, { "completion_length": 633.75, "epoch": 2.123059866962306, "grad_norm": 0.0, "kl": 0.308867484331131, "learning_rate": 3.1127308572195216e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7660 }, { "completion_length": 462.0, "epoch": 2.123337028824834, "grad_norm": 0.0, "kl": 0.20450836420059204, "learning_rate": 3.1123064089670734e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7661 }, { "completion_length": 661.25, "epoch": 2.1236141906873613, "grad_norm": 0.0, "kl": 0.6063534021377563, "learning_rate": 3.1118819419377348e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7662 }, { "completion_length": 636.0, "epoch": 2.1238913525498893, "grad_norm": 0.0, "kl": 0.16330359876155853, "learning_rate": 3.1114574561445225e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7663 }, { "completion_length": 598.0, "epoch": 2.1241685144124167, "grad_norm": 0.0, "kl": 0.21938152611255646, "learning_rate": 3.1110329516004546e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7664 }, { "completion_length": 554.25, "epoch": 2.1244456762749446, "grad_norm": 0.0, "kl": 0.6524734497070312, "learning_rate": 3.1106084283185466e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7665 }, { "completion_length": 525.5, "epoch": 2.124722838137472, "grad_norm": 0.0, "kl": 324747510415360.0, "learning_rate": 3.1101838863118188e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7666 }, { "completion_length": 562.5, "epoch": 2.125, "grad_norm": 1.6036430597305298, "kl": 0.22466619312763214, "learning_rate": 3.109759325593289e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7667 }, { "completion_length": 600.75, "epoch": 2.125277161862528, "grad_norm": 0.0, "kl": 0.17303705215454102, "learning_rate": 3.1093347461759775e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7668 }, { "completion_length": 591.0, "epoch": 2.1255543237250554, "grad_norm": 0.0, "kl": 0.23542168736457825, "learning_rate": 3.108910148072904e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7669 }, { "completion_length": 661.5, "epoch": 2.1258314855875833, "grad_norm": 0.0, "kl": 0.8144401907920837, "learning_rate": 3.1084855312970897e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7670 }, { "completion_length": 607.75, "epoch": 2.1261086474501107, "grad_norm": 0.0, "kl": 0.1664314568042755, "learning_rate": 3.108060895861555e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7671 }, { "completion_length": 635.0, "epoch": 2.1263858093126387, "grad_norm": 0.0, "kl": 0.1942306011915207, "learning_rate": 3.1076362417793226e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7672 }, { "completion_length": 503.75, "epoch": 2.126662971175166, "grad_norm": 0.0, "kl": 0.22768007218837738, "learning_rate": 3.1072115690634136e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7673 }, { "completion_length": 597.5, "epoch": 2.126940133037694, "grad_norm": 0.0, "kl": 0.16795557737350464, "learning_rate": 3.1067868777268527e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7674 }, { "completion_length": 743.75, "epoch": 2.127217294900222, "grad_norm": 0.0, "kl": 0.173252135515213, "learning_rate": 3.106362167782661e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7675 }, { "completion_length": 500.5, "epoch": 2.1274944567627494, "grad_norm": 0.0, "kl": 0.15401926636695862, "learning_rate": 3.105937439243865e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7676 }, { "completion_length": 558.25, "epoch": 2.1277716186252773, "grad_norm": 0.0, "kl": 0.1415974646806717, "learning_rate": 3.105512692123488e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7677 }, { "completion_length": 520.75, "epoch": 2.1280487804878048, "grad_norm": 0.0, "kl": 0.3975520431995392, "learning_rate": 3.1050879264345565e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7678 }, { "completion_length": 583.75, "epoch": 2.1283259423503327, "grad_norm": 0.0, "kl": 0.20698489248752594, "learning_rate": 3.104663142190094e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7679 }, { "completion_length": 555.0, "epoch": 2.12860310421286, "grad_norm": 0.0, "kl": 0.21980921924114227, "learning_rate": 3.1042383394031294e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7680 }, { "completion_length": 571.25, "epoch": 2.128880266075388, "grad_norm": 0.0, "kl": 0.16712410748004913, "learning_rate": 3.103813518086688e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7681 }, { "completion_length": 515.0, "epoch": 2.1291574279379155, "grad_norm": 0.0, "kl": 0.19783662259578705, "learning_rate": 3.103388678253798e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7682 }, { "completion_length": 528.5, "epoch": 2.1294345898004434, "grad_norm": 0.0, "kl": 0.2939407229423523, "learning_rate": 3.1029638199174865e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7683 }, { "completion_length": 582.75, "epoch": 2.1297117516629713, "grad_norm": 0.0, "kl": 0.20805147290229797, "learning_rate": 3.1025389430907836e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7684 }, { "completion_length": 502.25, "epoch": 2.129988913525499, "grad_norm": 0.4800618290901184, "kl": 135696857694208.0, "learning_rate": 3.1021140477867184e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7685 }, { "completion_length": 734.75, "epoch": 2.1302660753880267, "grad_norm": 0.30959436297416687, "kl": 219074235727872.0, "learning_rate": 3.1016891340183193e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7686 }, { "completion_length": 637.5, "epoch": 2.130543237250554, "grad_norm": 0.6268038153648376, "kl": 185984935264256.0, "learning_rate": 3.1012642017986177e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7687 }, { "completion_length": 522.75, "epoch": 2.130820399113082, "grad_norm": 0.0, "kl": 0.15921980142593384, "learning_rate": 3.1008392511406437e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7688 }, { "completion_length": 467.25, "epoch": 2.1310975609756095, "grad_norm": 0.0, "kl": 0.25376880168914795, "learning_rate": 3.10041428205743e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7689 }, { "completion_length": 513.25, "epoch": 2.1313747228381374, "grad_norm": 0.0, "kl": 0.45635631680488586, "learning_rate": 3.099989294562007e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7690 }, { "completion_length": 611.5, "epoch": 2.1316518847006654, "grad_norm": 0.0, "kl": 0.16274909675121307, "learning_rate": 3.0995642886674094e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7691 }, { "completion_length": 536.75, "epoch": 2.131929046563193, "grad_norm": 0.0, "kl": 0.2521215081214905, "learning_rate": 3.0991392643866687e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7692 }, { "completion_length": 580.75, "epoch": 2.1322062084257207, "grad_norm": 0.0, "kl": 0.21736860275268555, "learning_rate": 3.0987142217328193e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7693 }, { "completion_length": 556.5, "epoch": 2.132483370288248, "grad_norm": 0.0, "kl": 28381087268864.0, "learning_rate": 3.0982891607188948e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7694 }, { "completion_length": 538.75, "epoch": 2.132760532150776, "grad_norm": 0.0, "kl": 0.18833157420158386, "learning_rate": 3.0978640813579308e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7695 }, { "completion_length": 541.5, "epoch": 2.1330376940133036, "grad_norm": 0.0, "kl": 0.327021062374115, "learning_rate": 3.0974389836629628e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7696 }, { "completion_length": 535.5, "epoch": 2.1333148558758315, "grad_norm": 0.0, "kl": 0.29060834646224976, "learning_rate": 3.097013867647026e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7697 }, { "completion_length": 591.0, "epoch": 2.1335920177383594, "grad_norm": 0.0, "kl": 540701586620416.0, "learning_rate": 3.0965887333231577e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7698 }, { "completion_length": 533.0, "epoch": 2.133869179600887, "grad_norm": 1.391745924949646, "kl": 8690115584.0, "learning_rate": 3.096163580704394e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7699 }, { "completion_length": 633.5, "epoch": 2.1341463414634148, "grad_norm": 0.0, "kl": 203670872391680.0, "learning_rate": 3.095738409803774e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7700 }, { "completion_length": 570.75, "epoch": 2.134423503325942, "grad_norm": 0.0, "kl": 0.18607011437416077, "learning_rate": 3.095313220634335e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7701 }, { "completion_length": 514.0, "epoch": 2.13470066518847, "grad_norm": 0.0, "kl": 0.335586816072464, "learning_rate": 3.094888013209116e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7702 }, { "completion_length": 505.0, "epoch": 2.1349778270509976, "grad_norm": 0.0, "kl": 0.4929989278316498, "learning_rate": 3.0944627875411566e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7703 }, { "completion_length": 581.25, "epoch": 2.1352549889135255, "grad_norm": 0.0, "kl": 0.19357429444789886, "learning_rate": 3.0940375436434967e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7704 }, { "completion_length": 733.0, "epoch": 2.1355321507760534, "grad_norm": 0.0, "kl": 0.1461668759584427, "learning_rate": 3.093612281529175e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7705 }, { "completion_length": 549.5, "epoch": 2.135809312638581, "grad_norm": 0.0, "kl": 7271244800.0, "learning_rate": 3.0931870012112352e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7706 }, { "completion_length": 526.5, "epoch": 2.136086474501109, "grad_norm": 1.7692638635635376, "kl": 0.21878352761268616, "learning_rate": 3.092761702702717e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7707 }, { "completion_length": 606.0, "epoch": 2.1363636363636362, "grad_norm": 0.0, "kl": 0.18990008533000946, "learning_rate": 3.0923363860166638e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7708 }, { "completion_length": 554.0, "epoch": 2.136640798226164, "grad_norm": 0.0, "kl": 0.3816705048084259, "learning_rate": 3.091911051166117e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7709 }, { "completion_length": 487.75, "epoch": 2.1369179600886916, "grad_norm": 0.0, "kl": 0.20269224047660828, "learning_rate": 3.091485698164121e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7710 }, { "completion_length": 592.25, "epoch": 2.1371951219512195, "grad_norm": 0.0, "kl": 0.1807202696800232, "learning_rate": 3.0910603270237196e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7711 }, { "completion_length": 604.5, "epoch": 2.1374722838137474, "grad_norm": 0.38617125153541565, "kl": 300841084911616.0, "learning_rate": 3.0906349377579557e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7712 }, { "completion_length": 624.0, "epoch": 2.137749445676275, "grad_norm": 0.0, "kl": 0.191629558801651, "learning_rate": 3.0902095303798758e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7713 }, { "completion_length": 731.5, "epoch": 2.138026607538803, "grad_norm": 0.0, "kl": 0.13987798988819122, "learning_rate": 3.0897841049025246e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7714 }, { "completion_length": 541.5, "epoch": 2.1383037694013303, "grad_norm": 0.0, "kl": 0.19251853227615356, "learning_rate": 3.0893586613389494e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7715 }, { "completion_length": 548.75, "epoch": 2.138580931263858, "grad_norm": 0.0, "kl": 0.20361851155757904, "learning_rate": 3.088933199702194e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7716 }, { "completion_length": 558.5, "epoch": 2.1388580931263856, "grad_norm": 0.0, "kl": 0.21973681449890137, "learning_rate": 3.0885077200053086e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7717 }, { "completion_length": 597.75, "epoch": 2.1391352549889135, "grad_norm": 0.0, "kl": 0.17084208130836487, "learning_rate": 3.0880822222613395e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7718 }, { "completion_length": 594.75, "epoch": 2.1394124168514415, "grad_norm": 0.0, "kl": 1055248547840.0, "learning_rate": 3.0876567064833348e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7719 }, { "completion_length": 647.5, "epoch": 2.139689578713969, "grad_norm": 0.0, "kl": 0.166011244058609, "learning_rate": 3.0872311726843423e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7720 }, { "completion_length": 607.75, "epoch": 2.139966740576497, "grad_norm": 0.0, "kl": 0.18463356792926788, "learning_rate": 3.0868056208774138e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7721 }, { "completion_length": 618.0, "epoch": 2.1402439024390243, "grad_norm": 0.0, "kl": 0.17075090110301971, "learning_rate": 3.0863800510755976e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7722 }, { "completion_length": 610.75, "epoch": 2.140521064301552, "grad_norm": 0.0, "kl": 0.1507202833890915, "learning_rate": 3.0859544632919454e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7723 }, { "completion_length": 590.25, "epoch": 2.1407982261640797, "grad_norm": 0.0, "kl": 0.21917442977428436, "learning_rate": 3.085528857539506e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7724 }, { "completion_length": 664.75, "epoch": 2.1410753880266076, "grad_norm": 0.0, "kl": 0.15937994420528412, "learning_rate": 3.0851032338313337e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7725 }, { "completion_length": 579.0, "epoch": 2.1413525498891355, "grad_norm": 0.0, "kl": 0.18414436280727386, "learning_rate": 3.0846775921804776e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7726 }, { "completion_length": 500.5, "epoch": 2.141629711751663, "grad_norm": 0.0, "kl": 0.1991194635629654, "learning_rate": 3.0842519325999924e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7727 }, { "completion_length": 621.5, "epoch": 2.141906873614191, "grad_norm": 0.0, "kl": 0.6428899168968201, "learning_rate": 3.0838262551029315e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7728 }, { "completion_length": 619.5, "epoch": 2.1421840354767183, "grad_norm": 0.0, "kl": 0.1649160087108612, "learning_rate": 3.083400559702347e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7729 }, { "completion_length": 502.5, "epoch": 2.1424611973392462, "grad_norm": 0.0, "kl": 92288755695616.0, "learning_rate": 3.082974846411295e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7730 }, { "completion_length": 567.75, "epoch": 2.1427383592017737, "grad_norm": 0.0, "kl": 0.1542932689189911, "learning_rate": 3.082549115242829e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7731 }, { "completion_length": 521.25, "epoch": 2.1430155210643016, "grad_norm": 0.44199827313423157, "kl": 0.18127524852752686, "learning_rate": 3.0821233662100065e-06, "loss": 0.0, "reward": 5.71875, "reward_std": 0.0625, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.71875, "step": 7732 }, { "completion_length": 541.5, "epoch": 2.143292682926829, "grad_norm": 0.46273887157440186, "kl": 54305780400128.0, "learning_rate": 3.0816975993258806e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7733 }, { "completion_length": 553.0, "epoch": 2.143569844789357, "grad_norm": 0.0, "kl": 0.14868348836898804, "learning_rate": 3.0812718146035098e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7734 }, { "completion_length": 532.75, "epoch": 2.143847006651885, "grad_norm": 0.0, "kl": 0.20567995309829712, "learning_rate": 3.0808460120559493e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7735 }, { "completion_length": 623.0, "epoch": 2.1441241685144123, "grad_norm": 0.3579680323600769, "kl": 0.17742709815502167, "learning_rate": 3.080420191696259e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7736 }, { "completion_length": 601.0, "epoch": 2.1444013303769403, "grad_norm": 0.0, "kl": 0.620034396648407, "learning_rate": 3.0799943535374952e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7737 }, { "completion_length": 548.0, "epoch": 2.1446784922394677, "grad_norm": 0.0, "kl": 0.19984231889247894, "learning_rate": 3.079568497592718e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7738 }, { "completion_length": 550.5, "epoch": 2.1449556541019956, "grad_norm": 0.0, "kl": 3.0018961429595947, "learning_rate": 3.0791426238749855e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7739 }, { "completion_length": 733.75, "epoch": 2.145232815964523, "grad_norm": 1.6207627058029175, "kl": 0.11354148387908936, "learning_rate": 3.0787167323973584e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7740 }, { "completion_length": 662.0, "epoch": 2.145509977827051, "grad_norm": 0.0, "kl": 0.1566808819770813, "learning_rate": 3.078290823172896e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7741 }, { "completion_length": 566.75, "epoch": 2.145787139689579, "grad_norm": 0.0, "kl": 0.16366046667099, "learning_rate": 3.0778648962146597e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7742 }, { "completion_length": 673.0, "epoch": 2.1460643015521064, "grad_norm": 7.410759449005127, "kl": 1.3608884811401367, "learning_rate": 3.0774389515357106e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7743 }, { "completion_length": 659.5, "epoch": 2.1463414634146343, "grad_norm": 0.0, "kl": 0.2913607656955719, "learning_rate": 3.077012989149111e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7744 }, { "completion_length": 574.5, "epoch": 2.1466186252771617, "grad_norm": 0.0, "kl": 0.15982209146022797, "learning_rate": 3.076587009067924e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7745 }, { "completion_length": 575.5, "epoch": 2.1468957871396896, "grad_norm": 0.0, "kl": 0.1678302139043808, "learning_rate": 3.076161011305211e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7746 }, { "completion_length": 501.5, "epoch": 2.147172949002217, "grad_norm": 0.0, "kl": 0.1596013754606247, "learning_rate": 3.075734995874038e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7747 }, { "completion_length": 631.0, "epoch": 2.147450110864745, "grad_norm": 0.0, "kl": 0.16275617480278015, "learning_rate": 3.0753089627874668e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7748 }, { "completion_length": 609.5, "epoch": 2.147727272727273, "grad_norm": 0.0, "kl": 0.1750122755765915, "learning_rate": 3.074882912058563e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7749 }, { "completion_length": 540.5, "epoch": 2.1480044345898004, "grad_norm": 0.0, "kl": 0.19099311530590057, "learning_rate": 3.0744568437003906e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7750 }, { "completion_length": 690.5, "epoch": 2.1482815964523283, "grad_norm": 0.0, "kl": 0.3480807840824127, "learning_rate": 3.074030757726018e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7751 }, { "completion_length": 579.25, "epoch": 2.1485587583148558, "grad_norm": 0.0, "kl": 0.32139092683792114, "learning_rate": 3.0736046541485097e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7752 }, { "completion_length": 533.0, "epoch": 2.1488359201773837, "grad_norm": 0.0, "kl": 0.1687711775302887, "learning_rate": 3.073178532980933e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7753 }, { "completion_length": 542.25, "epoch": 2.149113082039911, "grad_norm": 0.0, "kl": 0.17060737311840057, "learning_rate": 3.0727523942363547e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7754 }, { "completion_length": 539.25, "epoch": 2.149390243902439, "grad_norm": 0.0, "kl": 288762658816.0, "learning_rate": 3.072326237927843e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7755 }, { "completion_length": 628.5, "epoch": 2.1496674057649665, "grad_norm": 0.0, "kl": 0.18992112576961517, "learning_rate": 3.0719000640684665e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7756 }, { "completion_length": 579.25, "epoch": 2.1499445676274944, "grad_norm": 0.0, "kl": 0.1778009831905365, "learning_rate": 3.0714738726712938e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7757 }, { "completion_length": 595.0, "epoch": 2.1502217294900223, "grad_norm": 0.0, "kl": 0.7504960894584656, "learning_rate": 3.0710476637493947e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7758 }, { "completion_length": 568.5, "epoch": 2.15049889135255, "grad_norm": 0.0, "kl": 0.20708779990673065, "learning_rate": 3.07062143731584e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7759 }, { "completion_length": 619.5, "epoch": 2.1507760532150777, "grad_norm": 0.0, "kl": 28961.740234375, "learning_rate": 3.070195193383699e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7760 }, { "completion_length": 651.75, "epoch": 2.151053215077605, "grad_norm": 0.0, "kl": 0.18122151494026184, "learning_rate": 3.069768931966043e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7761 }, { "completion_length": 606.75, "epoch": 2.151330376940133, "grad_norm": 0.0, "kl": 0.1995171308517456, "learning_rate": 3.0693426530759446e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7762 }, { "completion_length": 583.25, "epoch": 2.1516075388026605, "grad_norm": 0.0, "kl": 0.2658170163631439, "learning_rate": 3.068916356726475e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7763 }, { "completion_length": 599.0, "epoch": 2.1518847006651884, "grad_norm": 0.0, "kl": 0.19533097743988037, "learning_rate": 3.0684900429307084e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7764 }, { "completion_length": 624.0, "epoch": 2.1521618625277164, "grad_norm": 0.0, "kl": 0.14740288257598877, "learning_rate": 3.068063711701715e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7765 }, { "completion_length": 540.75, "epoch": 2.152439024390244, "grad_norm": 0.0, "kl": 18277489180672.0, "learning_rate": 3.067637363052573e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7766 }, { "completion_length": 563.75, "epoch": 2.1527161862527717, "grad_norm": 0.0, "kl": 0.1994544267654419, "learning_rate": 3.067210996996353e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7767 }, { "completion_length": 471.5, "epoch": 2.152993348115299, "grad_norm": 0.0, "kl": 0.23188969492912292, "learning_rate": 3.066784613546132e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7768 }, { "completion_length": 615.0, "epoch": 2.153270509977827, "grad_norm": 0.0, "kl": 0.1534343808889389, "learning_rate": 3.066358212714984e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7769 }, { "completion_length": 592.0, "epoch": 2.1535476718403546, "grad_norm": 0.0, "kl": 0.1674557775259018, "learning_rate": 3.0659317945159863e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7770 }, { "completion_length": 635.0, "epoch": 2.1538248337028825, "grad_norm": 4.938551425933838, "kl": 43.1798210144043, "learning_rate": 3.065505358962214e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7771 }, { "completion_length": 460.5, "epoch": 2.1541019955654104, "grad_norm": 0.0, "kl": 0.16853347420692444, "learning_rate": 3.0650789060667454e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7772 }, { "completion_length": 680.5, "epoch": 2.154379157427938, "grad_norm": 0.0, "kl": 0.16221335530281067, "learning_rate": 3.0646524358426568e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7773 }, { "completion_length": 502.5, "epoch": 2.1546563192904657, "grad_norm": 0.0, "kl": 0.5189216732978821, "learning_rate": 3.064225948303027e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7774 }, { "completion_length": 456.75, "epoch": 2.154933481152993, "grad_norm": 0.0, "kl": 0.37151727080345154, "learning_rate": 3.0637994434609346e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7775 }, { "completion_length": 491.75, "epoch": 2.155210643015521, "grad_norm": 0.0, "kl": 0.1771814078092575, "learning_rate": 3.0633729213294582e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7776 }, { "completion_length": 589.75, "epoch": 2.1554878048780486, "grad_norm": 3.5461480617523193, "kl": 217541687377920.0, "learning_rate": 3.0629463819216785e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7777 }, { "completion_length": 626.0, "epoch": 2.1557649667405765, "grad_norm": 0.0, "kl": 0.19624373316764832, "learning_rate": 3.062519825250675e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7778 }, { "completion_length": 495.5, "epoch": 2.1560421286031044, "grad_norm": 0.0, "kl": 0.37890246510505676, "learning_rate": 3.062093251329529e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7779 }, { "completion_length": 534.75, "epoch": 2.156319290465632, "grad_norm": 0.0, "kl": 0.24653927981853485, "learning_rate": 3.0616666601713196e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7780 }, { "completion_length": 525.25, "epoch": 2.1565964523281598, "grad_norm": 0.4831497371196747, "kl": 0.22905591130256653, "learning_rate": 3.0612400517891316e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7781 }, { "completion_length": 624.5, "epoch": 2.1568736141906872, "grad_norm": 0.0, "kl": 28152980045824.0, "learning_rate": 3.0608134261960455e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7782 }, { "completion_length": 543.75, "epoch": 2.157150776053215, "grad_norm": 0.0, "kl": 0.16729837656021118, "learning_rate": 3.0603867834051447e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7783 }, { "completion_length": 589.5, "epoch": 2.1574279379157426, "grad_norm": 0.0, "kl": 0.3384447395801544, "learning_rate": 3.0599601234295124e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7784 }, { "completion_length": 578.25, "epoch": 2.1577050997782705, "grad_norm": 0.0, "kl": 0.17660777270793915, "learning_rate": 3.0595334462822323e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7785 }, { "completion_length": 485.25, "epoch": 2.1579822616407984, "grad_norm": 0.0, "kl": 0.6859368681907654, "learning_rate": 3.0591067519763894e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7786 }, { "completion_length": 690.75, "epoch": 2.158259423503326, "grad_norm": 0.43647101521492004, "kl": 166877967941632.0, "learning_rate": 3.0586800405250677e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7787 }, { "completion_length": 573.75, "epoch": 2.158536585365854, "grad_norm": 0.0, "kl": 0.3546835780143738, "learning_rate": 3.058253311941353e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7788 }, { "completion_length": 541.0, "epoch": 2.1588137472283813, "grad_norm": 0.0, "kl": 0.20606586337089539, "learning_rate": 3.057826566238332e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7789 }, { "completion_length": 680.25, "epoch": 2.159090909090909, "grad_norm": 0.0, "kl": 0.14945831894874573, "learning_rate": 3.0573998034290908e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7790 }, { "completion_length": 491.25, "epoch": 2.1593680709534366, "grad_norm": 3.8278276920318604, "kl": 1.0703353881835938, "learning_rate": 3.0569730235267166e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7791 }, { "completion_length": 505.25, "epoch": 2.1596452328159645, "grad_norm": 0.0, "kl": 0.2201107144355774, "learning_rate": 3.056546226544296e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7792 }, { "completion_length": 618.0, "epoch": 2.1599223946784925, "grad_norm": 0.0, "kl": 0.25166577100753784, "learning_rate": 3.056119412494918e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7793 }, { "completion_length": 539.0, "epoch": 2.16019955654102, "grad_norm": 0.5771236419677734, "kl": 93716597440512.0, "learning_rate": 3.0556925813916717e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7794 }, { "completion_length": 531.5, "epoch": 2.160476718403548, "grad_norm": 0.0, "kl": 0.20563454926013947, "learning_rate": 3.055265733247645e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7795 }, { "completion_length": 580.5, "epoch": 2.1607538802660753, "grad_norm": 0.40192359685897827, "kl": 0.29548031091690063, "learning_rate": 3.054838868075928e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7796 }, { "completion_length": 574.25, "epoch": 2.161031042128603, "grad_norm": 0.0, "kl": 0.20645402371883392, "learning_rate": 3.0544119858896104e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7797 }, { "completion_length": 637.5, "epoch": 2.1613082039911307, "grad_norm": 0.0, "kl": 0.5677434206008911, "learning_rate": 3.0539850867017844e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7798 }, { "completion_length": 493.75, "epoch": 2.1615853658536586, "grad_norm": 0.0, "kl": 0.17245689034461975, "learning_rate": 3.0535581705255403e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7799 }, { "completion_length": 607.0, "epoch": 2.1618625277161865, "grad_norm": 0.0, "kl": 0.14073969423770905, "learning_rate": 3.0531312373739695e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7800 }, { "completion_length": 597.0, "epoch": 2.162139689578714, "grad_norm": 0.0, "kl": 0.18993349373340607, "learning_rate": 3.052704287260165e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7801 }, { "completion_length": 589.0, "epoch": 2.162416851441242, "grad_norm": 0.0, "kl": 0.16333580017089844, "learning_rate": 3.052277320197219e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7802 }, { "completion_length": 523.5, "epoch": 2.1626940133037693, "grad_norm": 0.0, "kl": 0.3321320414543152, "learning_rate": 3.0518503361982246e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7803 }, { "completion_length": 629.75, "epoch": 2.162971175166297, "grad_norm": 0.0, "kl": 0.17109432816505432, "learning_rate": 3.051423335276277e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7804 }, { "completion_length": 559.0, "epoch": 2.1632483370288247, "grad_norm": 0.0, "kl": 0.8180283308029175, "learning_rate": 3.0509963174444684e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7805 }, { "completion_length": 530.5, "epoch": 2.1635254988913526, "grad_norm": 0.0, "kl": 0.2827916443347931, "learning_rate": 3.0505692827158955e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7806 }, { "completion_length": 582.0, "epoch": 2.16380266075388, "grad_norm": 0.0, "kl": 4077.98681640625, "learning_rate": 3.0501422311036533e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7807 }, { "completion_length": 628.25, "epoch": 2.164079822616408, "grad_norm": 0.3921964168548584, "kl": 0.20153504610061646, "learning_rate": 3.0497151626208366e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7808 }, { "completion_length": 560.0, "epoch": 2.164356984478936, "grad_norm": 0.0, "kl": 0.21230487525463104, "learning_rate": 3.0492880772805433e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7809 }, { "completion_length": 634.0, "epoch": 2.1646341463414633, "grad_norm": 0.46347320079803467, "kl": 24747515576320.0, "learning_rate": 3.048860975095869e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7810 }, { "completion_length": 647.25, "epoch": 2.1649113082039912, "grad_norm": 0.0, "kl": 0.15808819234371185, "learning_rate": 3.048433856079912e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7811 }, { "completion_length": 644.25, "epoch": 2.1651884700665187, "grad_norm": 0.0, "kl": 0.15056805312633514, "learning_rate": 3.04800672024577e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7812 }, { "completion_length": 662.0, "epoch": 2.1654656319290466, "grad_norm": 0.0, "kl": 0.24268493056297302, "learning_rate": 3.0475795676065424e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7813 }, { "completion_length": 551.25, "epoch": 2.165742793791574, "grad_norm": 0.4281814992427826, "kl": 0.38260823488235474, "learning_rate": 3.0471523981753266e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7814 }, { "completion_length": 560.0, "epoch": 2.166019955654102, "grad_norm": 0.0, "kl": 0.16479890048503876, "learning_rate": 3.0467252119652233e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7815 }, { "completion_length": 702.5, "epoch": 2.16629711751663, "grad_norm": 0.0, "kl": 0.18064551055431366, "learning_rate": 3.0462980089893314e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7816 }, { "completion_length": 709.75, "epoch": 2.1665742793791574, "grad_norm": 0.0, "kl": 0.21241270005702972, "learning_rate": 3.0458707892607525e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7817 }, { "completion_length": 612.5, "epoch": 2.1668514412416853, "grad_norm": 0.0, "kl": 0.16955167055130005, "learning_rate": 3.045443552792587e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7818 }, { "completion_length": 610.5, "epoch": 2.1671286031042127, "grad_norm": 0.3293918967247009, "kl": 0.13634303212165833, "learning_rate": 3.045016299597937e-06, "loss": 0.0, "reward": 4.09375, "reward_std": 3.3125, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.59375, "step": 7819 }, { "completion_length": 506.0, "epoch": 2.1674057649667406, "grad_norm": 0.0, "kl": 0.549140989780426, "learning_rate": 3.0445890296899045e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7820 }, { "completion_length": 554.25, "epoch": 2.167682926829268, "grad_norm": 0.0, "kl": 0.20003865659236908, "learning_rate": 3.0441617430815915e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7821 }, { "completion_length": 499.0, "epoch": 2.167960088691796, "grad_norm": 0.0, "kl": 0.18075089156627655, "learning_rate": 3.043734439786102e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7822 }, { "completion_length": 576.75, "epoch": 2.168237250554324, "grad_norm": 1.3624900579452515, "kl": 23187.2265625, "learning_rate": 3.0433071198165383e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7823 }, { "completion_length": 554.25, "epoch": 2.1685144124168514, "grad_norm": 0.0, "kl": 0.20312364399433136, "learning_rate": 3.0428797831860065e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7824 }, { "completion_length": 699.75, "epoch": 2.1687915742793793, "grad_norm": 0.0, "kl": 0.15986847877502441, "learning_rate": 3.0424524299076087e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7825 }, { "completion_length": 539.75, "epoch": 2.1690687361419068, "grad_norm": 0.0, "kl": 0.29668888449668884, "learning_rate": 3.0420250599944525e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7826 }, { "completion_length": 684.0, "epoch": 2.1693458980044347, "grad_norm": 0.0, "kl": 0.14949192106723785, "learning_rate": 3.0415976734596413e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7827 }, { "completion_length": 512.5, "epoch": 2.169623059866962, "grad_norm": 0.0, "kl": 36927575162880.0, "learning_rate": 3.041170270316284e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7828 }, { "completion_length": 621.0, "epoch": 2.16990022172949, "grad_norm": 0.0, "kl": 0.17210698127746582, "learning_rate": 3.0407428505774844e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7829 }, { "completion_length": 518.75, "epoch": 2.1701773835920175, "grad_norm": 0.0, "kl": 0.16061335802078247, "learning_rate": 3.0403154142563517e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7830 }, { "completion_length": 545.5, "epoch": 2.1704545454545454, "grad_norm": 0.0, "kl": 0.16890601813793182, "learning_rate": 3.0398879613659925e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7831 }, { "completion_length": 583.75, "epoch": 2.1707317073170733, "grad_norm": 0.0, "kl": 1.6092609167099, "learning_rate": 3.039460491919516e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7832 }, { "completion_length": 594.0, "epoch": 2.171008869179601, "grad_norm": 2.3269567489624023, "kl": 83320436162560.0, "learning_rate": 3.0390330059300293e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7833 }, { "completion_length": 526.5, "epoch": 2.1712860310421287, "grad_norm": 0.0, "kl": 0.2035789042711258, "learning_rate": 3.038605503410644e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7834 }, { "completion_length": 726.25, "epoch": 2.171563192904656, "grad_norm": 0.0, "kl": 1335344.25, "learning_rate": 3.0381779843744675e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7835 }, { "completion_length": 588.25, "epoch": 2.171840354767184, "grad_norm": 0.0, "kl": 0.14120177924633026, "learning_rate": 3.037750448834611e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7836 }, { "completion_length": 614.5, "epoch": 2.1721175166297115, "grad_norm": 0.0, "kl": 0.18109749257564545, "learning_rate": 3.0373228968041855e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7837 }, { "completion_length": 638.0, "epoch": 2.1723946784922394, "grad_norm": 0.0, "kl": 477872.25, "learning_rate": 3.036895328296302e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7838 }, { "completion_length": 912.25, "epoch": 2.1726718403547673, "grad_norm": 0.0, "kl": 0.19494035840034485, "learning_rate": 3.036467743324072e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7839 }, { "completion_length": 673.75, "epoch": 2.172949002217295, "grad_norm": 0.0, "kl": 0.14954014122486115, "learning_rate": 3.036040141900608e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7840 }, { "completion_length": 608.25, "epoch": 2.1732261640798227, "grad_norm": 0.0, "kl": 0.15439973771572113, "learning_rate": 3.0356125240390227e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7841 }, { "completion_length": 618.75, "epoch": 2.17350332594235, "grad_norm": 0.0, "kl": 0.15738216042518616, "learning_rate": 3.03518488975243e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7842 }, { "completion_length": 595.75, "epoch": 2.173780487804878, "grad_norm": 0.0, "kl": 0.1551254242658615, "learning_rate": 3.034757239053942e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7843 }, { "completion_length": 617.0, "epoch": 2.1740576496674056, "grad_norm": 0.0, "kl": 0.2642778158187866, "learning_rate": 3.0343295719566747e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7844 }, { "completion_length": 609.75, "epoch": 2.1743348115299335, "grad_norm": 0.0, "kl": 0.21328671276569366, "learning_rate": 3.033901888473742e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7845 }, { "completion_length": 579.0, "epoch": 2.1746119733924614, "grad_norm": 0.0, "kl": 0.24994665384292603, "learning_rate": 3.0334741886182593e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7846 }, { "completion_length": 668.25, "epoch": 2.174889135254989, "grad_norm": 0.3903276324272156, "kl": 0.16874805092811584, "learning_rate": 3.0330464724033427e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7847 }, { "completion_length": 460.25, "epoch": 2.1751662971175167, "grad_norm": 0.0, "kl": 0.17671117186546326, "learning_rate": 3.0326187398421073e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7848 }, { "completion_length": 660.5, "epoch": 2.175443458980044, "grad_norm": 0.0, "kl": 0.23550313711166382, "learning_rate": 3.032190990947672e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7849 }, { "completion_length": 639.5, "epoch": 2.175720620842572, "grad_norm": 0.0, "kl": 0.6536067724227905, "learning_rate": 3.031763225733152e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7850 }, { "completion_length": 628.25, "epoch": 2.1759977827050996, "grad_norm": 0.0, "kl": 0.15595753490924835, "learning_rate": 3.031335444211667e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7851 }, { "completion_length": 615.75, "epoch": 2.1762749445676275, "grad_norm": 0.0, "kl": 3812612608.0, "learning_rate": 3.030907646396333e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7852 }, { "completion_length": 583.0, "epoch": 2.1765521064301554, "grad_norm": 0.0, "kl": 0.15820762515068054, "learning_rate": 3.0304798323002708e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7853 }, { "completion_length": 685.0, "epoch": 2.176829268292683, "grad_norm": 0.0, "kl": 0.17922352254390717, "learning_rate": 3.030052001936598e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7854 }, { "completion_length": 513.25, "epoch": 2.1771064301552108, "grad_norm": 0.0, "kl": 0.25242188572883606, "learning_rate": 3.0296241553184363e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7855 }, { "completion_length": 625.25, "epoch": 2.1773835920177382, "grad_norm": 0.0, "kl": 0.1294374018907547, "learning_rate": 3.0291962924589045e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7856 }, { "completion_length": 563.5, "epoch": 2.177660753880266, "grad_norm": 0.0, "kl": 0.17108120024204254, "learning_rate": 3.0287684133711243e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7857 }, { "completion_length": 528.0, "epoch": 2.1779379157427936, "grad_norm": 0.0, "kl": 1338918043648.0, "learning_rate": 3.028340518068216e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7858 }, { "completion_length": 511.5, "epoch": 2.1782150776053215, "grad_norm": 0.0, "kl": 0.19095446169376373, "learning_rate": 3.0279126065633015e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7859 }, { "completion_length": 746.25, "epoch": 2.1784922394678494, "grad_norm": 0.0, "kl": 0.16880322992801666, "learning_rate": 3.027484678869504e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7860 }, { "completion_length": 493.5, "epoch": 2.178769401330377, "grad_norm": 0.0, "kl": 0.2011854201555252, "learning_rate": 3.0270567349999454e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7861 }, { "completion_length": 585.5, "epoch": 2.179046563192905, "grad_norm": 0.0, "kl": 2.2577552795410156, "learning_rate": 3.026628774967749e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7862 }, { "completion_length": 661.5, "epoch": 2.1793237250554323, "grad_norm": 0.0, "kl": 0.1638886034488678, "learning_rate": 3.026200798786039e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7863 }, { "completion_length": 560.25, "epoch": 2.17960088691796, "grad_norm": 0.0, "kl": 0.17127855122089386, "learning_rate": 3.0257728064679397e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7864 }, { "completion_length": 551.25, "epoch": 2.1798780487804876, "grad_norm": 0.0, "kl": 0.17623072862625122, "learning_rate": 3.0253447980265754e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7865 }, { "completion_length": 533.5, "epoch": 2.1801552106430155, "grad_norm": 0.0, "kl": 0.7461703419685364, "learning_rate": 3.0249167734750718e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7866 }, { "completion_length": 556.0, "epoch": 2.1804323725055434, "grad_norm": 0.0, "kl": 0.17826445400714874, "learning_rate": 3.0244887328265544e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7867 }, { "completion_length": 603.75, "epoch": 2.180709534368071, "grad_norm": 0.0, "kl": 0.17181311547756195, "learning_rate": 3.0240606760941495e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7868 }, { "completion_length": 642.0, "epoch": 2.180986696230599, "grad_norm": 0.0, "kl": 0.12867368757724762, "learning_rate": 3.0236326032909824e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7869 }, { "completion_length": 566.25, "epoch": 2.1812638580931263, "grad_norm": 0.0, "kl": 0.81055748462677, "learning_rate": 3.0232045144301824e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7870 }, { "completion_length": 677.25, "epoch": 2.181541019955654, "grad_norm": 0.0, "kl": 0.13174031674861908, "learning_rate": 3.0227764095248766e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7871 }, { "completion_length": 541.75, "epoch": 2.1818181818181817, "grad_norm": 0.0, "kl": 0.16512492299079895, "learning_rate": 3.0223482885881933e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7872 }, { "completion_length": 629.5, "epoch": 2.1820953436807096, "grad_norm": 0.0, "kl": 0.16256213188171387, "learning_rate": 3.0219201516332602e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7873 }, { "completion_length": 548.0, "epoch": 2.1823725055432375, "grad_norm": 0.0, "kl": 89981640.0, "learning_rate": 3.0214919986732076e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7874 }, { "completion_length": 545.0, "epoch": 2.182649667405765, "grad_norm": 0.0, "kl": 0.14887145161628723, "learning_rate": 3.0210638297211653e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7875 }, { "completion_length": 503.5, "epoch": 2.182926829268293, "grad_norm": 2.271209478378296, "kl": 141398.03125, "learning_rate": 3.020635644790262e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7876 }, { "completion_length": 592.25, "epoch": 2.1832039911308203, "grad_norm": 0.0, "kl": 0.14422155916690826, "learning_rate": 3.020207443893629e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7877 }, { "completion_length": 640.5, "epoch": 2.183481152993348, "grad_norm": 0.4321296513080597, "kl": 18374088196096.0, "learning_rate": 3.019779227044398e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7878 }, { "completion_length": 629.25, "epoch": 2.1837583148558757, "grad_norm": 0.0, "kl": 0.1616785079240799, "learning_rate": 3.0193509942557016e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7879 }, { "completion_length": 619.75, "epoch": 2.1840354767184036, "grad_norm": 0.0, "kl": 0.15126219391822815, "learning_rate": 3.0189227455406694e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7880 }, { "completion_length": 642.5, "epoch": 2.1843126385809315, "grad_norm": 0.0, "kl": 0.2827220559120178, "learning_rate": 3.0184944809124356e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7881 }, { "completion_length": 654.5, "epoch": 2.184589800443459, "grad_norm": 0.0, "kl": 0.27448856830596924, "learning_rate": 3.0180662003841326e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7882 }, { "completion_length": 564.75, "epoch": 2.184866962305987, "grad_norm": 0.0, "kl": 0.22626431286334991, "learning_rate": 3.0176379039688952e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7883 }, { "completion_length": 445.0, "epoch": 2.1851441241685143, "grad_norm": 0.0, "kl": 0.2051040232181549, "learning_rate": 3.0172095916798547e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7884 }, { "completion_length": 523.0, "epoch": 2.1854212860310422, "grad_norm": 0.0, "kl": 2.8101210594177246, "learning_rate": 3.01678126353015e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7885 }, { "completion_length": 625.25, "epoch": 2.1856984478935697, "grad_norm": 0.0, "kl": 0.16823841631412506, "learning_rate": 3.016352919532912e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7886 }, { "completion_length": 659.75, "epoch": 2.1859756097560976, "grad_norm": 0.34826406836509705, "kl": 71134.46875, "learning_rate": 3.0159245597012794e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7887 }, { "completion_length": 611.0, "epoch": 2.186252771618625, "grad_norm": 0.0, "kl": 645758.0, "learning_rate": 3.0154961840483854e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7888 }, { "completion_length": 549.5, "epoch": 2.186529933481153, "grad_norm": 0.0, "kl": 1.2982105016708374, "learning_rate": 3.0150677925873684e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7889 }, { "completion_length": 706.0, "epoch": 2.186807095343681, "grad_norm": 0.0, "kl": 0.16088396310806274, "learning_rate": 3.0146393853313647e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7890 }, { "completion_length": 627.25, "epoch": 2.1870842572062084, "grad_norm": 0.0, "kl": 0.17612910270690918, "learning_rate": 3.0142109622935118e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7891 }, { "completion_length": 519.5, "epoch": 2.1873614190687363, "grad_norm": 0.0, "kl": 0.17979030311107635, "learning_rate": 3.013782523486948e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7892 }, { "completion_length": 494.0, "epoch": 2.1876385809312637, "grad_norm": 0.0, "kl": 2.1379783153533936, "learning_rate": 3.013354068924811e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7893 }, { "completion_length": 592.25, "epoch": 2.1879157427937916, "grad_norm": 0.0, "kl": 0.12616927921772003, "learning_rate": 3.012925598620241e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7894 }, { "completion_length": 454.25, "epoch": 2.188192904656319, "grad_norm": 0.0, "kl": 0.1642223745584488, "learning_rate": 3.0124971125863756e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7895 }, { "completion_length": 652.25, "epoch": 2.188470066518847, "grad_norm": 0.0, "kl": 0.14837121963500977, "learning_rate": 3.0120686108363567e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7896 }, { "completion_length": 532.0, "epoch": 2.188747228381375, "grad_norm": 0.0, "kl": 0.14482638239860535, "learning_rate": 3.0116400933833232e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7897 }, { "completion_length": 597.0, "epoch": 2.1890243902439024, "grad_norm": 0.4922001361846924, "kl": 20951467032576.0, "learning_rate": 3.011211560240417e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7898 }, { "completion_length": 519.75, "epoch": 2.1893015521064303, "grad_norm": 0.0, "kl": 0.16368871927261353, "learning_rate": 3.0107830114207775e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7899 }, { "completion_length": 651.0, "epoch": 2.1895787139689578, "grad_norm": 0.0, "kl": 0.2543543577194214, "learning_rate": 3.010354446937549e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7900 }, { "completion_length": 516.5, "epoch": 2.1898558758314857, "grad_norm": 0.0, "kl": 0.20492751896381378, "learning_rate": 3.009925866803872e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7901 }, { "completion_length": 684.75, "epoch": 2.190133037694013, "grad_norm": 0.0, "kl": 2.8805856704711914, "learning_rate": 3.009497271032891e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7902 }, { "completion_length": 647.75, "epoch": 2.190410199556541, "grad_norm": 0.0, "kl": 0.1629745364189148, "learning_rate": 3.009068659637747e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7903 }, { "completion_length": 654.0, "epoch": 2.1906873614190685, "grad_norm": 0.0, "kl": 0.22506752610206604, "learning_rate": 3.0086400326315853e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7904 }, { "completion_length": 547.75, "epoch": 2.1909645232815964, "grad_norm": 0.0, "kl": 1.1943622827529907, "learning_rate": 3.0082113900275496e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7905 }, { "completion_length": 580.75, "epoch": 2.1912416851441243, "grad_norm": 0.0, "kl": 0.15990625321865082, "learning_rate": 3.007782731838784e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7906 }, { "completion_length": 573.25, "epoch": 2.191518847006652, "grad_norm": 0.0, "kl": 0.17056210339069366, "learning_rate": 3.007354058078435e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7907 }, { "completion_length": 528.75, "epoch": 2.1917960088691797, "grad_norm": 0.0, "kl": 0.17636695504188538, "learning_rate": 3.0069253687596476e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7908 }, { "completion_length": 503.5, "epoch": 2.192073170731707, "grad_norm": 0.0, "kl": 0.1550329327583313, "learning_rate": 3.0064966638955677e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7909 }, { "completion_length": 639.75, "epoch": 2.192350332594235, "grad_norm": 0.5202773809432983, "kl": 1862498.625, "learning_rate": 3.006067943499342e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7910 }, { "completion_length": 655.25, "epoch": 2.1926274944567625, "grad_norm": 0.4362907111644745, "kl": 1779522.25, "learning_rate": 3.005639207584119e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7911 }, { "completion_length": 583.0, "epoch": 2.1929046563192904, "grad_norm": 0.6944326758384705, "kl": 283533.40625, "learning_rate": 3.0052104561630435e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7912 }, { "completion_length": 529.0, "epoch": 2.1931818181818183, "grad_norm": 0.0, "kl": 0.1812363862991333, "learning_rate": 3.0047816892492653e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7913 }, { "completion_length": 468.0, "epoch": 2.193458980044346, "grad_norm": 0.0, "kl": 1.1042203903198242, "learning_rate": 3.004352906855932e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7914 }, { "completion_length": 541.25, "epoch": 2.1937361419068737, "grad_norm": 0.0, "kl": 0.19889110326766968, "learning_rate": 3.003924108996194e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7915 }, { "completion_length": 574.25, "epoch": 2.194013303769401, "grad_norm": 0.0, "kl": 1.0446105003356934, "learning_rate": 3.003495295683199e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7916 }, { "completion_length": 546.75, "epoch": 2.194290465631929, "grad_norm": 0.0, "kl": 0.42612993717193604, "learning_rate": 3.003066466930099e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7917 }, { "completion_length": 556.25, "epoch": 2.1945676274944566, "grad_norm": 0.0, "kl": 0.15350809693336487, "learning_rate": 3.002637622750042e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7918 }, { "completion_length": 601.5, "epoch": 2.1948447893569845, "grad_norm": 1.3447223901748657, "kl": 25496.140625, "learning_rate": 3.002208763156181e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7919 }, { "completion_length": 590.5, "epoch": 2.1951219512195124, "grad_norm": 0.0, "kl": 0.19224274158477783, "learning_rate": 3.0017798881616657e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7920 }, { "completion_length": 564.75, "epoch": 2.19539911308204, "grad_norm": 0.0, "kl": 0.1667676568031311, "learning_rate": 3.0013509977796483e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7921 }, { "completion_length": 566.5, "epoch": 2.1956762749445677, "grad_norm": 0.0, "kl": 0.14353330433368683, "learning_rate": 3.000922092023281e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7922 }, { "completion_length": 649.25, "epoch": 2.195953436807095, "grad_norm": 0.0, "kl": 0.15981830656528473, "learning_rate": 3.0004931709057183e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7923 }, { "completion_length": 474.5, "epoch": 2.196230598669623, "grad_norm": 0.0, "kl": 0.22994212806224823, "learning_rate": 3.0000642344401115e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7924 }, { "completion_length": 593.75, "epoch": 2.1965077605321506, "grad_norm": 0.0, "kl": 0.14233528077602386, "learning_rate": 2.999635282639615e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7925 }, { "completion_length": 651.0, "epoch": 2.1967849223946785, "grad_norm": 0.0, "kl": 0.1438566893339157, "learning_rate": 2.9992063155173823e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7926 }, { "completion_length": 442.0, "epoch": 2.1970620842572064, "grad_norm": 0.0, "kl": 0.16884870827198029, "learning_rate": 2.9987773330865687e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7927 }, { "completion_length": 522.75, "epoch": 2.197339246119734, "grad_norm": 0.0, "kl": 0.2817457616329193, "learning_rate": 2.9983483353603298e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7928 }, { "completion_length": 548.5, "epoch": 2.1976164079822618, "grad_norm": 0.0, "kl": 0.19285771250724792, "learning_rate": 2.9979193223518195e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7929 }, { "completion_length": 538.75, "epoch": 2.1978935698447892, "grad_norm": 1.2511521577835083, "kl": 2615372288.0, "learning_rate": 2.9974902940741957e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7930 }, { "completion_length": 562.75, "epoch": 2.198170731707317, "grad_norm": 0.0, "kl": 0.9533612728118896, "learning_rate": 2.997061250540613e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7931 }, { "completion_length": 528.25, "epoch": 2.1984478935698446, "grad_norm": 0.0, "kl": 0.33585125207901, "learning_rate": 2.996632191764231e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7932 }, { "completion_length": 599.25, "epoch": 2.1987250554323725, "grad_norm": 0.0, "kl": 0.1432136744260788, "learning_rate": 2.9962031177582046e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7933 }, { "completion_length": 586.0, "epoch": 2.1990022172949004, "grad_norm": 0.0, "kl": 0.17267364263534546, "learning_rate": 2.9957740285356933e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7934 }, { "completion_length": 707.0, "epoch": 2.199279379157428, "grad_norm": 0.0, "kl": 0.1509568840265274, "learning_rate": 2.995344924109854e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7935 }, { "completion_length": 508.0, "epoch": 2.199556541019956, "grad_norm": 0.0, "kl": 0.21386633813381195, "learning_rate": 2.9949158044938474e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7936 }, { "completion_length": 578.0, "epoch": 2.1998337028824833, "grad_norm": 0.0, "kl": 0.1949356496334076, "learning_rate": 2.9944866697008314e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7937 }, { "completion_length": 557.5, "epoch": 2.200110864745011, "grad_norm": 0.0, "kl": 0.16823597252368927, "learning_rate": 2.9940575197439664e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7938 }, { "completion_length": 581.25, "epoch": 2.2003880266075386, "grad_norm": 0.0, "kl": 0.2218295931816101, "learning_rate": 2.9936283546364124e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7939 }, { "completion_length": 499.75, "epoch": 2.2006651884700665, "grad_norm": 0.0, "kl": 1.3735589981079102, "learning_rate": 2.9931991743913307e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7940 }, { "completion_length": 568.25, "epoch": 2.2009423503325944, "grad_norm": 0.34462079405784607, "kl": 0.16562286019325256, "learning_rate": 2.992769979021882e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7941 }, { "completion_length": 559.0, "epoch": 2.201219512195122, "grad_norm": 0.0, "kl": 0.28951582312583923, "learning_rate": 2.992340768541227e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7942 }, { "completion_length": 598.75, "epoch": 2.20149667405765, "grad_norm": 0.0, "kl": 0.22372157871723175, "learning_rate": 2.9919115429625295e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7943 }, { "completion_length": 638.75, "epoch": 2.2017738359201773, "grad_norm": 0.0, "kl": 0.18643170595169067, "learning_rate": 2.99148230229895e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7944 }, { "completion_length": 664.25, "epoch": 2.202050997782705, "grad_norm": 0.4611158072948456, "kl": 58821363892224.0, "learning_rate": 2.9910530465636544e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7945 }, { "completion_length": 488.25, "epoch": 2.2023281596452327, "grad_norm": 0.0, "kl": 0.21517710387706757, "learning_rate": 2.9906237757698036e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7946 }, { "completion_length": 538.75, "epoch": 2.2026053215077606, "grad_norm": 0.0, "kl": 0.15823182463645935, "learning_rate": 2.9901944899305635e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7947 }, { "completion_length": 610.5, "epoch": 2.2028824833702885, "grad_norm": 0.0, "kl": 0.15353940427303314, "learning_rate": 2.9897651890590966e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7948 }, { "completion_length": 778.25, "epoch": 2.203159645232816, "grad_norm": 0.0, "kl": 0.19851884245872498, "learning_rate": 2.9893358731685696e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7949 }, { "completion_length": 532.75, "epoch": 2.203436807095344, "grad_norm": 0.0, "kl": 0.20319435000419617, "learning_rate": 2.9889065422721463e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7950 }, { "completion_length": 627.0, "epoch": 2.2037139689578713, "grad_norm": 0.35760655999183655, "kl": 1316969.5, "learning_rate": 2.988477196382993e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7951 }, { "completion_length": 506.75, "epoch": 2.203991130820399, "grad_norm": 0.0, "kl": 0.17031177878379822, "learning_rate": 2.988047835514276e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7952 }, { "completion_length": 529.5, "epoch": 2.2042682926829267, "grad_norm": 0.0, "kl": 0.1887090504169464, "learning_rate": 2.987618459679163e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7953 }, { "completion_length": 551.75, "epoch": 2.2045454545454546, "grad_norm": 0.0, "kl": 0.2445765733718872, "learning_rate": 2.98718906889082e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7954 }, { "completion_length": 638.75, "epoch": 2.2048226164079825, "grad_norm": 0.0, "kl": 0.13731065392494202, "learning_rate": 2.9867596631624146e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7955 }, { "completion_length": 604.75, "epoch": 2.20509977827051, "grad_norm": 0.0, "kl": 0.16398245096206665, "learning_rate": 2.9863302425071156e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7956 }, { "completion_length": 575.25, "epoch": 2.205376940133038, "grad_norm": 0.0, "kl": 0.17252217233181, "learning_rate": 2.98590080693809e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7957 }, { "completion_length": 586.5, "epoch": 2.2056541019955653, "grad_norm": 0.0, "kl": 0.1798286885023117, "learning_rate": 2.9854713564685095e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7958 }, { "completion_length": 566.5, "epoch": 2.2059312638580932, "grad_norm": 0.37735897302627563, "kl": 284005987516416.0, "learning_rate": 2.9850418911115414e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7959 }, { "completion_length": 636.0, "epoch": 2.2062084257206207, "grad_norm": 0.0, "kl": 0.18221762776374817, "learning_rate": 2.9846124108803557e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7960 }, { "completion_length": 517.5, "epoch": 2.2064855875831486, "grad_norm": 0.0, "kl": 0.19017474353313446, "learning_rate": 2.9841829157881236e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7961 }, { "completion_length": 503.5, "epoch": 2.206762749445676, "grad_norm": 0.0, "kl": 0.21752052009105682, "learning_rate": 2.983753405848016e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7962 }, { "completion_length": 509.0, "epoch": 2.207039911308204, "grad_norm": 0.0, "kl": 0.18870759010314941, "learning_rate": 2.9833238810732036e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7963 }, { "completion_length": 580.0, "epoch": 2.207317073170732, "grad_norm": 0.0, "kl": 0.2640552520751953, "learning_rate": 2.9828943414768583e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7964 }, { "completion_length": 587.0, "epoch": 2.2075942350332594, "grad_norm": 0.0, "kl": 0.190085768699646, "learning_rate": 2.9824647870721525e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7965 }, { "completion_length": 574.5, "epoch": 2.2078713968957873, "grad_norm": 0.0, "kl": 0.2438545674085617, "learning_rate": 2.982035217872258e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7966 }, { "completion_length": 526.0, "epoch": 2.2081485587583147, "grad_norm": 0.0, "kl": 4.459245681762695, "learning_rate": 2.9816056338903493e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7967 }, { "completion_length": 536.0, "epoch": 2.2084257206208426, "grad_norm": 0.0, "kl": 0.20072269439697266, "learning_rate": 2.9811760351395994e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7968 }, { "completion_length": 570.75, "epoch": 2.20870288248337, "grad_norm": 0.0, "kl": 0.1639743149280548, "learning_rate": 2.9807464216331815e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7969 }, { "completion_length": 638.25, "epoch": 2.208980044345898, "grad_norm": 0.0, "kl": 0.12663371860980988, "learning_rate": 2.9803167933842712e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7970 }, { "completion_length": 560.5, "epoch": 2.209257206208426, "grad_norm": 0.0, "kl": 0.15343990921974182, "learning_rate": 2.979887150406043e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7971 }, { "completion_length": 558.5, "epoch": 2.2095343680709534, "grad_norm": 0.0, "kl": 0.15085218846797943, "learning_rate": 2.9794574927116716e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7972 }, { "completion_length": 605.0, "epoch": 2.2098115299334813, "grad_norm": 0.0, "kl": 0.13819679617881775, "learning_rate": 2.9790278203143342e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7973 }, { "completion_length": 491.5, "epoch": 2.2100886917960088, "grad_norm": 0.0, "kl": 0.1608244776725769, "learning_rate": 2.9785981332272052e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7974 }, { "completion_length": 592.25, "epoch": 2.2103658536585367, "grad_norm": 0.0, "kl": 0.1478491723537445, "learning_rate": 2.978168431463463e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7975 }, { "completion_length": 595.25, "epoch": 2.210643015521064, "grad_norm": 0.0, "kl": 0.1476295441389084, "learning_rate": 2.977738715036284e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7976 }, { "completion_length": 567.5, "epoch": 2.210920177383592, "grad_norm": 0.0, "kl": 0.1811279058456421, "learning_rate": 2.9773089839588463e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7977 }, { "completion_length": 627.5, "epoch": 2.21119733924612, "grad_norm": 0.0, "kl": 0.15821674466133118, "learning_rate": 2.976879238244327e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7978 }, { "completion_length": 541.25, "epoch": 2.2114745011086474, "grad_norm": 0.0, "kl": 0.17711730301380157, "learning_rate": 2.976449477905906e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7979 }, { "completion_length": 656.5, "epoch": 2.2117516629711753, "grad_norm": 0.48078200221061707, "kl": 159528775581696.0, "learning_rate": 2.976019702956761e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7980 }, { "completion_length": 579.75, "epoch": 2.212028824833703, "grad_norm": 0.0, "kl": 0.1772947609424591, "learning_rate": 2.975589913410072e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7981 }, { "completion_length": 621.75, "epoch": 2.2123059866962307, "grad_norm": 0.0, "kl": 0.2851680517196655, "learning_rate": 2.9751601092790185e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7982 }, { "completion_length": 588.75, "epoch": 2.212583148558758, "grad_norm": 0.0, "kl": 0.18183839321136475, "learning_rate": 2.9747302905767816e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7983 }, { "completion_length": 553.75, "epoch": 2.212860310421286, "grad_norm": 0.0, "kl": 829949548167168.0, "learning_rate": 2.9743004573165408e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7984 }, { "completion_length": 592.5, "epoch": 2.2131374722838135, "grad_norm": 0.0, "kl": 1.262474775314331, "learning_rate": 2.9738706095114788e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7985 }, { "completion_length": 520.75, "epoch": 2.2134146341463414, "grad_norm": 0.0, "kl": 0.1628161072731018, "learning_rate": 2.9734407471747762e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7986 }, { "completion_length": 549.0, "epoch": 2.2136917960088693, "grad_norm": 0.5030738115310669, "kl": 351061634187264.0, "learning_rate": 2.973010870319615e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7987 }, { "completion_length": 549.25, "epoch": 2.213968957871397, "grad_norm": 0.0, "kl": 0.17159302532672882, "learning_rate": 2.972580978959179e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7988 }, { "completion_length": 589.0, "epoch": 2.2142461197339247, "grad_norm": 0.0, "kl": 0.1751900315284729, "learning_rate": 2.972151073106649e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7989 }, { "completion_length": 517.25, "epoch": 2.214523281596452, "grad_norm": 0.0, "kl": 0.17840726673603058, "learning_rate": 2.97172115277521e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7990 }, { "completion_length": 542.5, "epoch": 2.21480044345898, "grad_norm": 0.0, "kl": 4.981210708618164, "learning_rate": 2.9712912179780457e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7991 }, { "completion_length": 541.75, "epoch": 2.2150776053215075, "grad_norm": 0.0, "kl": 0.1811026781797409, "learning_rate": 2.9708612687283407e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7992 }, { "completion_length": 507.5, "epoch": 2.2153547671840355, "grad_norm": 0.0, "kl": 0.15556120872497559, "learning_rate": 2.9704313050392785e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7993 }, { "completion_length": 514.0, "epoch": 2.2156319290465634, "grad_norm": 0.0, "kl": 824853636579328.0, "learning_rate": 2.9700013269240463e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7994 }, { "completion_length": 475.75, "epoch": 2.215909090909091, "grad_norm": 0.0, "kl": 0.17932258546352386, "learning_rate": 2.9695713343958277e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7995 }, { "completion_length": 439.25, "epoch": 2.2161862527716187, "grad_norm": 0.0, "kl": 0.18300117552280426, "learning_rate": 2.9691413274678093e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7996 }, { "completion_length": 537.25, "epoch": 2.216463414634146, "grad_norm": 0.0, "kl": 0.20435650646686554, "learning_rate": 2.9687113061531785e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7997 }, { "completion_length": 499.25, "epoch": 2.216740576496674, "grad_norm": 0.0, "kl": 0.1638643443584442, "learning_rate": 2.968281270465122e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7998 }, { "completion_length": 492.5, "epoch": 2.2170177383592016, "grad_norm": 0.0, "kl": 2.049433946609497, "learning_rate": 2.9678512204168263e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 7999 }, { "completion_length": 580.25, "epoch": 2.2172949002217295, "grad_norm": 0.37070590257644653, "kl": 3444662824075264.0, "learning_rate": 2.967421156021481e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8000 }, { "completion_length": 588.5, "epoch": 2.2175720620842574, "grad_norm": 0.7442098259925842, "kl": 2433603094970368.0, "learning_rate": 2.9669910772922724e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8001 }, { "completion_length": 469.25, "epoch": 2.217849223946785, "grad_norm": 0.0, "kl": 0.1833350509405136, "learning_rate": 2.9665609842423904e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8002 }, { "completion_length": 509.25, "epoch": 2.2181263858093128, "grad_norm": 0.0, "kl": 0.21471956372261047, "learning_rate": 2.9661308768850233e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8003 }, { "completion_length": 530.75, "epoch": 2.2184035476718402, "grad_norm": 0.0, "kl": 0.21697385609149933, "learning_rate": 2.9657007552333615e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8004 }, { "completion_length": 488.25, "epoch": 2.218680709534368, "grad_norm": 0.0, "kl": 0.5024918913841248, "learning_rate": 2.9652706193005947e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8005 }, { "completion_length": 630.75, "epoch": 2.2189578713968956, "grad_norm": 0.0, "kl": 0.151175394654274, "learning_rate": 2.9648404690999144e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8006 }, { "completion_length": 510.0, "epoch": 2.2192350332594235, "grad_norm": 0.0, "kl": 0.18699929118156433, "learning_rate": 2.9644103046445095e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8007 }, { "completion_length": 578.0, "epoch": 2.2195121951219514, "grad_norm": 0.0, "kl": 0.17510774731636047, "learning_rate": 2.963980125947573e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8008 }, { "completion_length": 514.25, "epoch": 2.219789356984479, "grad_norm": 0.0, "kl": 0.39588478207588196, "learning_rate": 2.9635499330222963e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8009 }, { "completion_length": 528.5, "epoch": 2.220066518847007, "grad_norm": 0.0, "kl": 0.20154796540737152, "learning_rate": 2.9631197258818713e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8010 }, { "completion_length": 676.5, "epoch": 2.2203436807095343, "grad_norm": 0.0, "kl": 0.17273496091365814, "learning_rate": 2.9626895045394898e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8011 }, { "completion_length": 530.25, "epoch": 2.220620842572062, "grad_norm": 0.0, "kl": 0.18572010099887848, "learning_rate": 2.9622592690083475e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8012 }, { "completion_length": 524.5, "epoch": 2.2208980044345896, "grad_norm": 0.0, "kl": 0.19387181103229523, "learning_rate": 2.9618290193016357e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8013 }, { "completion_length": 532.0, "epoch": 2.2211751662971175, "grad_norm": 0.0, "kl": 0.29519984126091003, "learning_rate": 2.961398755432549e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8014 }, { "completion_length": 601.0, "epoch": 2.2214523281596454, "grad_norm": 0.0, "kl": 0.1433882862329483, "learning_rate": 2.960968477414282e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8015 }, { "completion_length": 563.25, "epoch": 2.221729490022173, "grad_norm": 0.0, "kl": 0.22676125168800354, "learning_rate": 2.960538185260029e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8016 }, { "completion_length": 525.75, "epoch": 2.222006651884701, "grad_norm": 0.0, "kl": 0.24064074456691742, "learning_rate": 2.9601078789829866e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8017 }, { "completion_length": 549.25, "epoch": 2.2222838137472283, "grad_norm": 0.0, "kl": 0.16892535984516144, "learning_rate": 2.959677558596349e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8018 }, { "completion_length": 487.75, "epoch": 2.222560975609756, "grad_norm": 0.5575457215309143, "kl": 1518522.625, "learning_rate": 2.9592472241133124e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8019 }, { "completion_length": 561.5, "epoch": 2.2228381374722836, "grad_norm": 0.0, "kl": 0.17537622153759003, "learning_rate": 2.9588168755470743e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8020 }, { "completion_length": 505.0, "epoch": 2.2231152993348116, "grad_norm": 0.0, "kl": 0.1919957399368286, "learning_rate": 2.958386512910831e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8021 }, { "completion_length": 602.25, "epoch": 2.2233924611973395, "grad_norm": 0.0, "kl": 0.17247778177261353, "learning_rate": 2.9579561362177807e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8022 }, { "completion_length": 605.0, "epoch": 2.223669623059867, "grad_norm": 0.0, "kl": 0.15164920687675476, "learning_rate": 2.95752574548112e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8023 }, { "completion_length": 540.75, "epoch": 2.223946784922395, "grad_norm": 0.402291476726532, "kl": 8730281238331392.0, "learning_rate": 2.957095340714049e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8024 }, { "completion_length": 534.0, "epoch": 2.2242239467849223, "grad_norm": 0.0, "kl": 0.17748284339904785, "learning_rate": 2.9566649219297645e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8025 }, { "completion_length": 603.0, "epoch": 2.22450110864745, "grad_norm": 0.0, "kl": 0.16734828054904938, "learning_rate": 2.956234489141466e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8026 }, { "completion_length": 662.75, "epoch": 2.2247782705099777, "grad_norm": 0.0, "kl": 0.13692501187324524, "learning_rate": 2.9558040423623545e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8027 }, { "completion_length": 608.25, "epoch": 2.2250554323725056, "grad_norm": 0.0, "kl": 0.17849870026111603, "learning_rate": 2.9553735816056293e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8028 }, { "completion_length": 579.0, "epoch": 2.2253325942350335, "grad_norm": 0.375311017036438, "kl": 0.2803828716278076, "learning_rate": 2.95494310688449e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8029 }, { "completion_length": 652.75, "epoch": 2.225609756097561, "grad_norm": 0.0, "kl": 0.1298690140247345, "learning_rate": 2.954512618212139e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8030 }, { "completion_length": 718.25, "epoch": 2.225886917960089, "grad_norm": 0.0, "kl": 0.15530139207839966, "learning_rate": 2.954082115601776e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8031 }, { "completion_length": 657.25, "epoch": 2.2261640798226163, "grad_norm": 0.0, "kl": 0.12872178852558136, "learning_rate": 2.9536515990666044e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8032 }, { "completion_length": 592.0, "epoch": 2.2264412416851442, "grad_norm": 0.0, "kl": 16969.474609375, "learning_rate": 2.9532210686198237e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8033 }, { "completion_length": 646.75, "epoch": 2.2267184035476717, "grad_norm": 0.0, "kl": 0.15116696059703827, "learning_rate": 2.9527905242746397e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8034 }, { "completion_length": 568.75, "epoch": 2.2269955654101996, "grad_norm": 0.0, "kl": 0.8566405773162842, "learning_rate": 2.952359966044253e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8035 }, { "completion_length": 530.5, "epoch": 2.227272727272727, "grad_norm": 0.0, "kl": 0.2162516713142395, "learning_rate": 2.9519293939418687e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8036 }, { "completion_length": 525.25, "epoch": 2.227549889135255, "grad_norm": 0.0, "kl": 0.2050503045320511, "learning_rate": 2.9514988079806893e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8037 }, { "completion_length": 674.5, "epoch": 2.227827050997783, "grad_norm": 0.0, "kl": 0.14366239309310913, "learning_rate": 2.9510682081739195e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8038 }, { "completion_length": 593.5, "epoch": 2.2281042128603104, "grad_norm": 0.0, "kl": 0.16960078477859497, "learning_rate": 2.950637594534765e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8039 }, { "completion_length": 563.25, "epoch": 2.2283813747228383, "grad_norm": 0.0, "kl": 0.1702904999256134, "learning_rate": 2.9502069670764296e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8040 }, { "completion_length": 538.75, "epoch": 2.2286585365853657, "grad_norm": 0.0, "kl": 0.21186111867427826, "learning_rate": 2.9497763258121188e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8041 }, { "completion_length": 634.0, "epoch": 2.2289356984478936, "grad_norm": 0.0, "kl": 0.127048060297966, "learning_rate": 2.9493456707550394e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8042 }, { "completion_length": 635.0, "epoch": 2.229212860310421, "grad_norm": 0.0, "kl": 0.18602871894836426, "learning_rate": 2.9489150019183975e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8043 }, { "completion_length": 535.5, "epoch": 2.229490022172949, "grad_norm": 0.6148238182067871, "kl": 1792175.0, "learning_rate": 2.9484843193154e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8044 }, { "completion_length": 527.5, "epoch": 2.229767184035477, "grad_norm": 0.0, "kl": 0.19734320044517517, "learning_rate": 2.948053622959255e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8045 }, { "completion_length": 634.5, "epoch": 2.2300443458980044, "grad_norm": 0.0, "kl": 0.14162899553775787, "learning_rate": 2.9476229128631677e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8046 }, { "completion_length": 578.0, "epoch": 2.2303215077605323, "grad_norm": 0.0, "kl": 1836064696172544.0, "learning_rate": 2.9471921890403487e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8047 }, { "completion_length": 519.75, "epoch": 2.2305986696230597, "grad_norm": 0.0, "kl": 0.2683772146701813, "learning_rate": 2.946761451504005e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8048 }, { "completion_length": 570.5, "epoch": 2.2308758314855877, "grad_norm": 0.0, "kl": 0.16816923022270203, "learning_rate": 2.9463307002673465e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8049 }, { "completion_length": 697.5, "epoch": 2.231152993348115, "grad_norm": 1.370906114578247, "kl": 189231693824.0, "learning_rate": 2.945899935343582e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8050 }, { "completion_length": 614.25, "epoch": 2.231430155210643, "grad_norm": 4.51113748550415, "kl": 36645931843584.0, "learning_rate": 2.9454691567459213e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8051 }, { "completion_length": 644.75, "epoch": 2.231707317073171, "grad_norm": 0.0, "kl": 0.1336781084537506, "learning_rate": 2.9450383644875746e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8052 }, { "completion_length": 578.0, "epoch": 2.2319844789356984, "grad_norm": 0.3598873019218445, "kl": 0.16861578822135925, "learning_rate": 2.944607558581753e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8053 }, { "completion_length": 563.5, "epoch": 2.2322616407982263, "grad_norm": 0.0, "kl": 0.23057524859905243, "learning_rate": 2.9441767390416665e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8054 }, { "completion_length": 631.25, "epoch": 2.2325388026607538, "grad_norm": 0.0, "kl": 0.1610339730978012, "learning_rate": 2.9437459058805274e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8055 }, { "completion_length": 554.5, "epoch": 2.2328159645232817, "grad_norm": 0.0, "kl": 0.15330477058887482, "learning_rate": 2.9433150591115474e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8056 }, { "completion_length": 711.25, "epoch": 2.233093126385809, "grad_norm": 0.0, "kl": 0.17458459734916687, "learning_rate": 2.9428841987479384e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8057 }, { "completion_length": 564.25, "epoch": 2.233370288248337, "grad_norm": 0.0, "kl": 0.17895588278770447, "learning_rate": 2.9424533248029134e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8058 }, { "completion_length": 448.25, "epoch": 2.2336474501108645, "grad_norm": 0.0, "kl": 0.18129919469356537, "learning_rate": 2.9420224372896856e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8059 }, { "completion_length": 559.5, "epoch": 2.2339246119733924, "grad_norm": 0.0, "kl": 0.17989283800125122, "learning_rate": 2.941591536221469e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8060 }, { "completion_length": 584.75, "epoch": 2.2342017738359203, "grad_norm": 0.4330572783946991, "kl": 546900767932416.0, "learning_rate": 2.941160621611476e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8061 }, { "completion_length": 624.0, "epoch": 2.234478935698448, "grad_norm": 0.0, "kl": 0.13784842193126678, "learning_rate": 2.9407296934729227e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8062 }, { "completion_length": 620.25, "epoch": 2.2347560975609757, "grad_norm": 0.0, "kl": 0.181092768907547, "learning_rate": 2.9402987518190223e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8063 }, { "completion_length": 604.75, "epoch": 2.235033259423503, "grad_norm": 0.31992653012275696, "kl": 0.13236990571022034, "learning_rate": 2.939867796662992e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8064 }, { "completion_length": 612.5, "epoch": 2.235310421286031, "grad_norm": 0.0, "kl": 0.14985330402851105, "learning_rate": 2.9394368280180447e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8065 }, { "completion_length": 588.25, "epoch": 2.2355875831485585, "grad_norm": 0.0, "kl": 0.11926046013832092, "learning_rate": 2.9390058458973993e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8066 }, { "completion_length": 523.0, "epoch": 2.2358647450110865, "grad_norm": 0.0, "kl": 0.17295163869857788, "learning_rate": 2.9385748503142697e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8067 }, { "completion_length": 584.0, "epoch": 2.2361419068736144, "grad_norm": 0.0, "kl": 0.15254461765289307, "learning_rate": 2.9381438412818748e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8068 }, { "completion_length": 566.25, "epoch": 2.236419068736142, "grad_norm": 0.0, "kl": 0.14749397337436676, "learning_rate": 2.9377128188134303e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8069 }, { "completion_length": 570.5, "epoch": 2.2366962305986697, "grad_norm": 0.0, "kl": 0.15777380764484406, "learning_rate": 2.9372817829221538e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8070 }, { "completion_length": 617.5, "epoch": 2.236973392461197, "grad_norm": 0.0, "kl": 0.1802506446838379, "learning_rate": 2.936850733621265e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8071 }, { "completion_length": 539.5, "epoch": 2.237250554323725, "grad_norm": 0.0, "kl": 0.18157576024532318, "learning_rate": 2.9364196709239816e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8072 }, { "completion_length": 497.5, "epoch": 2.2375277161862526, "grad_norm": 0.0, "kl": 0.17310132086277008, "learning_rate": 2.935988594843522e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8073 }, { "completion_length": 677.0, "epoch": 2.2378048780487805, "grad_norm": 0.0, "kl": 0.1343829482793808, "learning_rate": 2.9355575053931057e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8074 }, { "completion_length": 699.75, "epoch": 2.2380820399113084, "grad_norm": 0.36036571860313416, "kl": 1525126.375, "learning_rate": 2.935126402585953e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8075 }, { "completion_length": 526.75, "epoch": 2.238359201773836, "grad_norm": 0.0, "kl": 7.219672203063965, "learning_rate": 2.9346952864352835e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8076 }, { "completion_length": 552.25, "epoch": 2.2386363636363638, "grad_norm": 0.0, "kl": 0.15694940090179443, "learning_rate": 2.9342641569543177e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8077 }, { "completion_length": 518.5, "epoch": 2.238913525498891, "grad_norm": 0.0, "kl": 0.18182814121246338, "learning_rate": 2.9338330141562766e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8078 }, { "completion_length": 583.25, "epoch": 2.239190687361419, "grad_norm": 0.3719337582588196, "kl": 0.21332179009914398, "learning_rate": 2.933401858054382e-06, "loss": -0.0, "reward": 1.59375, "reward_std": 0.3125, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 8079 }, { "completion_length": 533.75, "epoch": 2.2394678492239466, "grad_norm": 0.0, "kl": 0.2354896366596222, "learning_rate": 2.9329706886618547e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8080 }, { "completion_length": 523.5, "epoch": 2.2397450110864745, "grad_norm": 0.0, "kl": 0.16894927620887756, "learning_rate": 2.9325395059919187e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8081 }, { "completion_length": 465.25, "epoch": 2.2400221729490024, "grad_norm": 0.0, "kl": 0.16826632618904114, "learning_rate": 2.932108310057794e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8082 }, { "completion_length": 526.25, "epoch": 2.24029933481153, "grad_norm": 0.0, "kl": 0.1899099051952362, "learning_rate": 2.9316771008727063e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8083 }, { "completion_length": 583.5, "epoch": 2.240576496674058, "grad_norm": 0.0, "kl": 0.1428980529308319, "learning_rate": 2.9312458784498763e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8084 }, { "completion_length": 537.75, "epoch": 2.2408536585365852, "grad_norm": 0.0, "kl": 0.21034348011016846, "learning_rate": 2.93081464280253e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8085 }, { "completion_length": 559.5, "epoch": 2.241130820399113, "grad_norm": 0.0, "kl": 0.14824865758419037, "learning_rate": 2.9303833939438905e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8086 }, { "completion_length": 513.75, "epoch": 2.2414079822616406, "grad_norm": 0.0, "kl": 0.6448432803153992, "learning_rate": 2.9299521318871827e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8087 }, { "completion_length": 553.75, "epoch": 2.2416851441241685, "grad_norm": 0.0, "kl": 7850229235712.0, "learning_rate": 2.9295208566456317e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8088 }, { "completion_length": 495.75, "epoch": 2.2419623059866964, "grad_norm": 0.0, "kl": 0.16492633521556854, "learning_rate": 2.929089568232462e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8089 }, { "completion_length": 538.5, "epoch": 2.242239467849224, "grad_norm": 0.0, "kl": 0.18756937980651855, "learning_rate": 2.9286582666609013e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8090 }, { "completion_length": 506.0, "epoch": 2.242516629711752, "grad_norm": 4.930966377258301, "kl": 3571049216.0, "learning_rate": 2.9282269519441743e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8091 }, { "completion_length": 509.0, "epoch": 2.2427937915742793, "grad_norm": 0.0, "kl": 0.1688888967037201, "learning_rate": 2.9277956240955084e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8092 }, { "completion_length": 605.75, "epoch": 2.243070953436807, "grad_norm": 0.0, "kl": 0.5623878836631775, "learning_rate": 2.9273642831281292e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8093 }, { "completion_length": 786.5, "epoch": 2.2433481152993346, "grad_norm": 0.0, "kl": 0.09234777837991714, "learning_rate": 2.926932929055266e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8094 }, { "completion_length": 525.0, "epoch": 2.2436252771618626, "grad_norm": 0.0, "kl": 4904.6533203125, "learning_rate": 2.926501561890146e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8095 }, { "completion_length": 741.0, "epoch": 2.2439024390243905, "grad_norm": 0.30089622735977173, "kl": 0.12949097156524658, "learning_rate": 2.9260701816459973e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8096 }, { "completion_length": 512.0, "epoch": 2.244179600886918, "grad_norm": 0.0, "kl": 0.18585370481014252, "learning_rate": 2.925638788336048e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8097 }, { "completion_length": 514.0, "epoch": 2.244456762749446, "grad_norm": 0.6543210744857788, "kl": 1030505.4375, "learning_rate": 2.925207381973529e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8098 }, { "completion_length": 645.5, "epoch": 2.2447339246119733, "grad_norm": 0.0, "kl": 0.14947156608104706, "learning_rate": 2.924775962571667e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8099 }, { "completion_length": 495.0, "epoch": 2.245011086474501, "grad_norm": 0.0, "kl": 0.16058744490146637, "learning_rate": 2.924344530143694e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8100 }, { "completion_length": 623.0, "epoch": 2.2452882483370287, "grad_norm": 0.0, "kl": 0.14302587509155273, "learning_rate": 2.9239130847028384e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8101 }, { "completion_length": 549.5, "epoch": 2.2455654101995566, "grad_norm": 0.0, "kl": 0.2047315239906311, "learning_rate": 2.9234816262623334e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8102 }, { "completion_length": 482.25, "epoch": 2.2458425720620845, "grad_norm": 0.0, "kl": 0.1844949722290039, "learning_rate": 2.9230501548354077e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8103 }, { "completion_length": 524.75, "epoch": 2.246119733924612, "grad_norm": 0.0, "kl": 0.22709079086780548, "learning_rate": 2.9226186704352944e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8104 }, { "completion_length": 464.5, "epoch": 2.24639689578714, "grad_norm": 0.0, "kl": 0.17034508287906647, "learning_rate": 2.9221871730752234e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8105 }, { "completion_length": 506.75, "epoch": 2.2466740576496673, "grad_norm": 0.0, "kl": 0.15689805150032043, "learning_rate": 2.9217556627684283e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8106 }, { "completion_length": 624.0, "epoch": 2.2469512195121952, "grad_norm": 0.0, "kl": 0.235188290476799, "learning_rate": 2.9213241395281423e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8107 }, { "completion_length": 489.75, "epoch": 2.2472283813747227, "grad_norm": 0.0, "kl": 0.16647350788116455, "learning_rate": 2.920892603367596e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8108 }, { "completion_length": 594.25, "epoch": 2.2475055432372506, "grad_norm": 0.0, "kl": 0.13581156730651855, "learning_rate": 2.920461054300026e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8109 }, { "completion_length": 516.75, "epoch": 2.247782705099778, "grad_norm": 0.0, "kl": 0.17946678400039673, "learning_rate": 2.920029492338664e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8110 }, { "completion_length": 515.25, "epoch": 2.248059866962306, "grad_norm": 0.0, "kl": 0.16292500495910645, "learning_rate": 2.919597917496745e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8111 }, { "completion_length": 588.5, "epoch": 2.248337028824834, "grad_norm": 0.5579349398612976, "kl": 288158.875, "learning_rate": 2.9191663297875027e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8112 }, { "completion_length": 606.25, "epoch": 2.2486141906873613, "grad_norm": 0.0, "kl": 0.15529297292232513, "learning_rate": 2.9187347292241742e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8113 }, { "completion_length": 713.5, "epoch": 2.2488913525498893, "grad_norm": 0.0, "kl": 0.13255399465560913, "learning_rate": 2.918303115819992e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8114 }, { "completion_length": 643.25, "epoch": 2.2491685144124167, "grad_norm": 0.0, "kl": 0.13344009220600128, "learning_rate": 2.9178714895881944e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8115 }, { "completion_length": 613.0, "epoch": 2.2494456762749446, "grad_norm": 0.38231196999549866, "kl": 3627771756544.0, "learning_rate": 2.917439850542015e-06, "loss": 0.0, "reward": 2.6875, "reward_std": 2.04506516456604, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.6875, "step": 8116 }, { "completion_length": 454.25, "epoch": 2.249722838137472, "grad_norm": 0.0, "kl": 0.20156042277812958, "learning_rate": 2.917008198694693e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8117 }, { "completion_length": 582.5, "epoch": 2.25, "grad_norm": 0.0, "kl": 0.162301167845726, "learning_rate": 2.9165765340594638e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8118 }, { "completion_length": 606.25, "epoch": 2.250277161862528, "grad_norm": 0.0, "kl": 0.15583482384681702, "learning_rate": 2.916144856649566e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8119 }, { "completion_length": 610.75, "epoch": 2.2505543237250554, "grad_norm": 0.0, "kl": 0.1435530185699463, "learning_rate": 2.9157131664782356e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8120 }, { "completion_length": 534.0, "epoch": 2.2508314855875833, "grad_norm": 0.0, "kl": 0.19319236278533936, "learning_rate": 2.915281463558712e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8121 }, { "completion_length": 509.0, "epoch": 2.2511086474501107, "grad_norm": 0.0, "kl": 0.1753395050764084, "learning_rate": 2.9148497479042337e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8122 }, { "completion_length": 484.0, "epoch": 2.2513858093126387, "grad_norm": 1.2327486276626587, "kl": 178144411648.0, "learning_rate": 2.9144180195280393e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8123 }, { "completion_length": 591.5, "epoch": 2.251662971175166, "grad_norm": 0.0, "kl": 0.15684571862220764, "learning_rate": 2.913986278443368e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8124 }, { "completion_length": 562.0, "epoch": 2.251940133037694, "grad_norm": 0.0, "kl": 0.17694644629955292, "learning_rate": 2.9135545246634595e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8125 }, { "completion_length": 530.5, "epoch": 2.2522172949002215, "grad_norm": 0.0, "kl": 0.1689782440662384, "learning_rate": 2.9131227582015547e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8126 }, { "completion_length": 476.75, "epoch": 2.2524944567627494, "grad_norm": 0.0, "kl": 0.1727539598941803, "learning_rate": 2.912690979070893e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8127 }, { "completion_length": 604.5, "epoch": 2.2527716186252773, "grad_norm": 0.0, "kl": 0.16167686879634857, "learning_rate": 2.912259187284716e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8128 }, { "completion_length": 537.75, "epoch": 2.2530487804878048, "grad_norm": 0.0, "kl": 0.14766660332679749, "learning_rate": 2.911827382856264e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8129 }, { "completion_length": 529.5, "epoch": 2.2533259423503327, "grad_norm": 0.0, "kl": 0.1503930687904358, "learning_rate": 2.91139556579878e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8130 }, { "completion_length": 465.5, "epoch": 2.25360310421286, "grad_norm": 0.0, "kl": 0.17125380039215088, "learning_rate": 2.9109637361255048e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8131 }, { "completion_length": 510.0, "epoch": 2.253880266075388, "grad_norm": 0.0, "kl": 0.1607913374900818, "learning_rate": 2.9105318938496823e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8132 }, { "completion_length": 562.75, "epoch": 2.2541574279379155, "grad_norm": 0.0, "kl": 0.13662712275981903, "learning_rate": 2.910100038984554e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8133 }, { "completion_length": 542.0, "epoch": 2.2544345898004434, "grad_norm": 0.0, "kl": 0.16643516719341278, "learning_rate": 2.9096681715433633e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8134 }, { "completion_length": 529.0, "epoch": 2.2547117516629713, "grad_norm": 0.0, "kl": 0.15734300017356873, "learning_rate": 2.9092362915393542e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8135 }, { "completion_length": 669.25, "epoch": 2.254988913525499, "grad_norm": 0.0, "kl": 0.1325361430644989, "learning_rate": 2.9088043989857702e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8136 }, { "completion_length": 880.25, "epoch": 2.2552660753880267, "grad_norm": 0.0, "kl": 0.13564226031303406, "learning_rate": 2.9083724938958555e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8137 }, { "completion_length": 541.75, "epoch": 2.255543237250554, "grad_norm": 0.0, "kl": 0.16170886158943176, "learning_rate": 2.907940576282856e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8138 }, { "completion_length": 513.25, "epoch": 2.255820399113082, "grad_norm": 0.0, "kl": 0.23777829110622406, "learning_rate": 2.9075086461600155e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8139 }, { "completion_length": 616.75, "epoch": 2.2560975609756095, "grad_norm": 0.0, "kl": 0.1631755828857422, "learning_rate": 2.9070767035405794e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8140 }, { "completion_length": 532.5, "epoch": 2.2563747228381374, "grad_norm": 3.4378161430358887, "kl": 579101.625, "learning_rate": 2.9066447484377957e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8141 }, { "completion_length": 534.75, "epoch": 2.2566518847006654, "grad_norm": 0.45360082387924194, "kl": 0.16255195438861847, "learning_rate": 2.9062127808649083e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8142 }, { "completion_length": 492.5, "epoch": 2.256929046563193, "grad_norm": 0.0, "kl": 0.15467770397663116, "learning_rate": 2.905780800835165e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8143 }, { "completion_length": 592.5, "epoch": 2.2572062084257207, "grad_norm": 0.36160174012184143, "kl": 0.6106769442558289, "learning_rate": 2.9053488083618118e-06, "loss": 0.0, "reward": 1.625, "reward_std": 0.25, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8144 }, { "completion_length": 663.75, "epoch": 2.257483370288248, "grad_norm": 0.3143302798271179, "kl": 0.14481136202812195, "learning_rate": 2.904916803458098e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8145 }, { "completion_length": 551.75, "epoch": 2.257760532150776, "grad_norm": 0.0, "kl": 0.202813059091568, "learning_rate": 2.904484786137269e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8146 }, { "completion_length": 578.0, "epoch": 2.2580376940133036, "grad_norm": 0.0, "kl": 0.1303793042898178, "learning_rate": 2.9040527564125754e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8147 }, { "completion_length": 502.25, "epoch": 2.2583148558758315, "grad_norm": 0.0, "kl": 0.16761620342731476, "learning_rate": 2.903620714297264e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8148 }, { "completion_length": 559.0, "epoch": 2.2585920177383594, "grad_norm": 0.6899760365486145, "kl": 25869.79296875, "learning_rate": 2.9031886598045844e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8149 }, { "completion_length": 606.75, "epoch": 2.258869179600887, "grad_norm": 0.0, "kl": 0.15227970480918884, "learning_rate": 2.902756592947786e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8150 }, { "completion_length": 579.0, "epoch": 2.2591463414634148, "grad_norm": 0.0, "kl": 6284941.0, "learning_rate": 2.902324513740118e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8151 }, { "completion_length": 550.5, "epoch": 2.259423503325942, "grad_norm": 0.0, "kl": 0.16484001278877258, "learning_rate": 2.9018924221948307e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8152 }, { "completion_length": 538.5, "epoch": 2.25970066518847, "grad_norm": 0.0, "kl": 0.1867719292640686, "learning_rate": 2.901460318325175e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8153 }, { "completion_length": 573.75, "epoch": 2.2599778270509976, "grad_norm": 0.0, "kl": 74461.734375, "learning_rate": 2.9010282021444008e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8154 }, { "completion_length": 562.25, "epoch": 2.2602549889135255, "grad_norm": 0.0, "kl": 1214.6412353515625, "learning_rate": 2.9005960736657606e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8155 }, { "completion_length": 697.5, "epoch": 2.2605321507760534, "grad_norm": 0.0, "kl": 0.12926191091537476, "learning_rate": 2.900163932902505e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8156 }, { "completion_length": 550.0, "epoch": 2.260809312638581, "grad_norm": 0.0, "kl": 0.17832960188388824, "learning_rate": 2.8997317798678858e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8157 }, { "completion_length": 594.75, "epoch": 2.261086474501109, "grad_norm": 0.0, "kl": 0.1485542505979538, "learning_rate": 2.8992996145751566e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8158 }, { "completion_length": 600.5, "epoch": 2.2613636363636362, "grad_norm": 0.0, "kl": 0.14165544509887695, "learning_rate": 2.8988674370375687e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8159 }, { "completion_length": 625.0, "epoch": 2.261640798226164, "grad_norm": 0.0, "kl": 0.15083332359790802, "learning_rate": 2.8984352472683753e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8160 }, { "completion_length": 589.0, "epoch": 2.2619179600886916, "grad_norm": 0.0, "kl": 0.16017821431159973, "learning_rate": 2.898003045280831e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8161 }, { "completion_length": 587.0, "epoch": 2.2621951219512195, "grad_norm": 1.9938509464263916, "kl": 41518063616.0, "learning_rate": 2.897570831088189e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8162 }, { "completion_length": 630.0, "epoch": 2.2624722838137474, "grad_norm": 0.0, "kl": 0.8220409154891968, "learning_rate": 2.897138604703703e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8163 }, { "completion_length": 591.5, "epoch": 2.262749445676275, "grad_norm": 0.0, "kl": 0.17991578578948975, "learning_rate": 2.896706366140629e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8164 }, { "completion_length": 598.75, "epoch": 2.263026607538803, "grad_norm": 0.0, "kl": 0.2732340395450592, "learning_rate": 2.89627411541222e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8165 }, { "completion_length": 558.5, "epoch": 2.2633037694013303, "grad_norm": 0.0, "kl": 0.15590092539787292, "learning_rate": 2.8958418525317326e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8166 }, { "completion_length": 520.5, "epoch": 2.263580931263858, "grad_norm": 0.0, "kl": 0.14799119532108307, "learning_rate": 2.8954095775124225e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8167 }, { "completion_length": 632.0, "epoch": 2.2638580931263856, "grad_norm": 0.0, "kl": 8337255424.0, "learning_rate": 2.8949772903675456e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8168 }, { "completion_length": 556.25, "epoch": 2.2641352549889135, "grad_norm": 0.0, "kl": 0.17866584658622742, "learning_rate": 2.894544991110357e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8169 }, { "completion_length": 568.5, "epoch": 2.2644124168514415, "grad_norm": 0.0, "kl": 0.16211992502212524, "learning_rate": 2.8941126797541163e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8170 }, { "completion_length": 639.25, "epoch": 2.264689578713969, "grad_norm": 0.0, "kl": 0.7455950975418091, "learning_rate": 2.8936803563120785e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8171 }, { "completion_length": 619.25, "epoch": 2.264966740576497, "grad_norm": 0.0, "kl": 0.16210757195949554, "learning_rate": 2.893248020797502e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8172 }, { "completion_length": 492.75, "epoch": 2.2652439024390243, "grad_norm": 0.0, "kl": 0.23157422244548798, "learning_rate": 2.892815673223645e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8173 }, { "completion_length": 622.25, "epoch": 2.265521064301552, "grad_norm": 0.4632183909416199, "kl": 24388671488.0, "learning_rate": 2.892383313603765e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8174 }, { "completion_length": 596.25, "epoch": 2.2657982261640797, "grad_norm": 0.0, "kl": 0.16496005654335022, "learning_rate": 2.8919509419511204e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8175 }, { "completion_length": 638.25, "epoch": 2.2660753880266076, "grad_norm": 0.0, "kl": 0.16674786806106567, "learning_rate": 2.891518558278972e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8176 }, { "completion_length": 612.0, "epoch": 2.2663525498891355, "grad_norm": 0.0, "kl": 0.14738334715366364, "learning_rate": 2.8910861626005774e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8177 }, { "completion_length": 546.0, "epoch": 2.266629711751663, "grad_norm": 0.0, "kl": 0.18592840433120728, "learning_rate": 2.8906537549291976e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8178 }, { "completion_length": 689.25, "epoch": 2.266906873614191, "grad_norm": 0.0, "kl": 0.15752796828746796, "learning_rate": 2.890221335278092e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8179 }, { "completion_length": 612.0, "epoch": 2.2671840354767183, "grad_norm": 0.0, "kl": 0.148220032453537, "learning_rate": 2.889788903660521e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8180 }, { "completion_length": 582.75, "epoch": 2.2674611973392462, "grad_norm": 0.0, "kl": 0.1798277348279953, "learning_rate": 2.889356460089746e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8181 }, { "completion_length": 560.25, "epoch": 2.2677383592017737, "grad_norm": 0.0, "kl": 0.20688298344612122, "learning_rate": 2.8889240045790277e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8182 }, { "completion_length": 579.25, "epoch": 2.2680155210643016, "grad_norm": 0.0, "kl": 0.16607308387756348, "learning_rate": 2.888491537141629e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8183 }, { "completion_length": 603.5, "epoch": 2.2682926829268295, "grad_norm": 0.0, "kl": 145.2934112548828, "learning_rate": 2.88805905779081e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8184 }, { "completion_length": 598.25, "epoch": 2.268569844789357, "grad_norm": 0.0, "kl": 0.12565085291862488, "learning_rate": 2.8876265665398345e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8185 }, { "completion_length": 595.5, "epoch": 2.268847006651885, "grad_norm": 0.0, "kl": 0.14447717368602753, "learning_rate": 2.887194063401965e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8186 }, { "completion_length": 513.25, "epoch": 2.2691241685144123, "grad_norm": 0.0, "kl": 0.258011132478714, "learning_rate": 2.886761548390463e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8187 }, { "completion_length": 536.25, "epoch": 2.2694013303769403, "grad_norm": 0.0, "kl": 0.14497597515583038, "learning_rate": 2.8863290215185946e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8188 }, { "completion_length": 548.75, "epoch": 2.2696784922394677, "grad_norm": 0.46639618277549744, "kl": 4575001088.0, "learning_rate": 2.885896482799621e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8189 }, { "completion_length": 543.0, "epoch": 2.2699556541019956, "grad_norm": 1.705094575881958, "kl": 479142848.0, "learning_rate": 2.8854639322468082e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8190 }, { "completion_length": 586.0, "epoch": 2.270232815964523, "grad_norm": 0.0, "kl": 0.26638063788414, "learning_rate": 2.8850313698734202e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8191 }, { "completion_length": 511.25, "epoch": 2.270509977827051, "grad_norm": 0.0, "kl": 0.16900429129600525, "learning_rate": 2.884598795692722e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8192 }, { "completion_length": 722.25, "epoch": 2.270787139689579, "grad_norm": 0.3610715866088867, "kl": 7297408512.0, "learning_rate": 2.8841662097179777e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8193 }, { "completion_length": 594.0, "epoch": 2.2710643015521064, "grad_norm": 0.0, "kl": 0.14821304380893707, "learning_rate": 2.883733611962455e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8194 }, { "completion_length": 514.0, "epoch": 2.2713414634146343, "grad_norm": 0.0, "kl": 0.18949174880981445, "learning_rate": 2.8833010024394174e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8195 }, { "completion_length": 717.5, "epoch": 2.2716186252771617, "grad_norm": 0.0, "kl": 0.13924968242645264, "learning_rate": 2.882868381162134e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8196 }, { "completion_length": 491.75, "epoch": 2.2718957871396896, "grad_norm": 0.0, "kl": 0.2292344719171524, "learning_rate": 2.882435748143869e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8197 }, { "completion_length": 485.25, "epoch": 2.272172949002217, "grad_norm": 0.0, "kl": 0.32500603795051575, "learning_rate": 2.882003103397891e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8198 }, { "completion_length": 562.25, "epoch": 2.272450110864745, "grad_norm": 0.0, "kl": 0.2018926441669464, "learning_rate": 2.8815704469374662e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8199 }, { "completion_length": 579.75, "epoch": 2.2727272727272725, "grad_norm": 0.5844259262084961, "kl": 2783493888.0, "learning_rate": 2.881137778775864e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8200 }, { "completion_length": 595.75, "epoch": 2.2730044345898004, "grad_norm": 0.0, "kl": 0.2051146775484085, "learning_rate": 2.8807050989263507e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8201 }, { "completion_length": 582.75, "epoch": 2.2732815964523283, "grad_norm": 0.0, "kl": 0.14662297070026398, "learning_rate": 2.8802724074021972e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8202 }, { "completion_length": 543.0, "epoch": 2.2735587583148558, "grad_norm": 0.5280228853225708, "kl": 1166.4171142578125, "learning_rate": 2.8798397042166693e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8203 }, { "completion_length": 640.25, "epoch": 2.2738359201773837, "grad_norm": 2.394686222076416, "kl": 0.438892126083374, "learning_rate": 2.8794069893830386e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8204 }, { "completion_length": 619.5, "epoch": 2.274113082039911, "grad_norm": 0.0, "kl": 0.15412701666355133, "learning_rate": 2.8789742629145736e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8205 }, { "completion_length": 534.75, "epoch": 2.274390243902439, "grad_norm": 0.0, "kl": 0.2083044797182083, "learning_rate": 2.8785415248245437e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8206 }, { "completion_length": 579.0, "epoch": 2.2746674057649665, "grad_norm": 0.0, "kl": 0.1612800508737564, "learning_rate": 2.878108775126221e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8207 }, { "completion_length": 606.0, "epoch": 2.2749445676274944, "grad_norm": 1.541433334350586, "kl": 438058880.0, "learning_rate": 2.8776760138328745e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8208 }, { "completion_length": 605.25, "epoch": 2.2752217294900223, "grad_norm": 0.0, "kl": 0.13941514492034912, "learning_rate": 2.8772432409577765e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8209 }, { "completion_length": 559.25, "epoch": 2.27549889135255, "grad_norm": 0.0, "kl": 0.14943937957286835, "learning_rate": 2.876810456514197e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8210 }, { "completion_length": 514.0, "epoch": 2.2757760532150777, "grad_norm": 0.0, "kl": 0.2081037014722824, "learning_rate": 2.8763776605154085e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8211 }, { "completion_length": 632.0, "epoch": 2.276053215077605, "grad_norm": 0.0, "kl": 0.12906184792518616, "learning_rate": 2.8759448529746824e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8212 }, { "completion_length": 638.5, "epoch": 2.276330376940133, "grad_norm": 0.0, "kl": 0.15547527372837067, "learning_rate": 2.8755120339052916e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8213 }, { "completion_length": 601.5, "epoch": 2.2766075388026605, "grad_norm": 0.0, "kl": 0.14616988599300385, "learning_rate": 2.8750792033205086e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8214 }, { "completion_length": 563.0, "epoch": 2.2768847006651884, "grad_norm": 0.0, "kl": 0.17793108522891998, "learning_rate": 2.8746463612336078e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8215 }, { "completion_length": 622.0, "epoch": 2.2771618625277164, "grad_norm": 0.0, "kl": 0.18404816091060638, "learning_rate": 2.8742135076578608e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8216 }, { "completion_length": 561.25, "epoch": 2.277439024390244, "grad_norm": 3.948862075805664, "kl": 58725716.0, "learning_rate": 2.8737806426065427e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8217 }, { "completion_length": 676.75, "epoch": 2.2777161862527717, "grad_norm": 0.0, "kl": 0.16416144371032715, "learning_rate": 2.873347766092927e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8218 }, { "completion_length": 579.75, "epoch": 2.277993348115299, "grad_norm": 0.0, "kl": 0.17209988832473755, "learning_rate": 2.872914878130288e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8219 }, { "completion_length": 646.75, "epoch": 2.278270509977827, "grad_norm": 0.0, "kl": 0.1569019854068756, "learning_rate": 2.872481978731901e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8220 }, { "completion_length": 507.5, "epoch": 2.2785476718403546, "grad_norm": 0.0, "kl": 0.15708167850971222, "learning_rate": 2.872049067911042e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8221 }, { "completion_length": 539.75, "epoch": 2.2788248337028825, "grad_norm": 0.5966886878013611, "kl": 168152.90625, "learning_rate": 2.871616145680985e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8222 }, { "completion_length": 734.75, "epoch": 2.2791019955654104, "grad_norm": 0.0, "kl": 0.15078005194664001, "learning_rate": 2.8711832120550065e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8223 }, { "completion_length": 592.5, "epoch": 2.279379157427938, "grad_norm": 0.0, "kl": 0.15484386682510376, "learning_rate": 2.8707502670463838e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8224 }, { "completion_length": 445.75, "epoch": 2.2796563192904657, "grad_norm": 0.0, "kl": 0.4974767565727234, "learning_rate": 2.870317310668392e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8225 }, { "completion_length": 608.75, "epoch": 2.279933481152993, "grad_norm": 0.0, "kl": 0.17178714275360107, "learning_rate": 2.869884342934309e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8226 }, { "completion_length": 562.5, "epoch": 2.280210643015521, "grad_norm": 0.0, "kl": 0.17917485535144806, "learning_rate": 2.8694513638574117e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8227 }, { "completion_length": 512.5, "epoch": 2.2804878048780486, "grad_norm": 0.0, "kl": 0.17451603710651398, "learning_rate": 2.8690183734509786e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8228 }, { "completion_length": 587.0, "epoch": 2.2807649667405765, "grad_norm": 0.0, "kl": 687753396224.0, "learning_rate": 2.8685853717282865e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8229 }, { "completion_length": 594.0, "epoch": 2.2810421286031044, "grad_norm": 0.0, "kl": 0.2075018733739853, "learning_rate": 2.868152358702615e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8230 }, { "completion_length": 604.75, "epoch": 2.281319290465632, "grad_norm": 0.0, "kl": 0.15239420533180237, "learning_rate": 2.867719334387241e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8231 }, { "completion_length": 651.5, "epoch": 2.2815964523281598, "grad_norm": 0.0, "kl": 0.14574821293354034, "learning_rate": 2.867286298795446e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8232 }, { "completion_length": 595.75, "epoch": 2.2818736141906872, "grad_norm": 0.0, "kl": 0.1537865549325943, "learning_rate": 2.866853251940507e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8233 }, { "completion_length": 617.75, "epoch": 2.282150776053215, "grad_norm": 1.5254119634628296, "kl": 17570.802734375, "learning_rate": 2.8664201938357052e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8234 }, { "completion_length": 516.5, "epoch": 2.2824279379157426, "grad_norm": 0.0, "kl": 0.1655641347169876, "learning_rate": 2.86598712449432e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8235 }, { "completion_length": 526.5, "epoch": 2.2827050997782705, "grad_norm": 0.0, "kl": 0.16089817881584167, "learning_rate": 2.8655540439296326e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8236 }, { "completion_length": 667.25, "epoch": 2.2829822616407984, "grad_norm": 0.0, "kl": 0.1447252333164215, "learning_rate": 2.8651209521549233e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8237 }, { "completion_length": 623.25, "epoch": 2.283259423503326, "grad_norm": 0.0, "kl": 0.1555570811033249, "learning_rate": 2.864687849183473e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8238 }, { "completion_length": 588.5, "epoch": 2.283536585365854, "grad_norm": 0.0, "kl": 0.2509802579879761, "learning_rate": 2.8642547350285637e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8239 }, { "completion_length": 542.5, "epoch": 2.2838137472283813, "grad_norm": 0.0, "kl": 0.19232246279716492, "learning_rate": 2.8638216097034773e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8240 }, { "completion_length": 538.5, "epoch": 2.284090909090909, "grad_norm": 0.0, "kl": 0.17867036163806915, "learning_rate": 2.863388473221495e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8241 }, { "completion_length": 639.75, "epoch": 2.2843680709534366, "grad_norm": 0.42809316515922546, "kl": 0.11413778364658356, "learning_rate": 2.862955325595899e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8242 }, { "completion_length": 592.0, "epoch": 2.2846452328159645, "grad_norm": 0.0, "kl": 0.16920267045497894, "learning_rate": 2.862522166839975e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8243 }, { "completion_length": 645.0, "epoch": 2.2849223946784925, "grad_norm": 0.0, "kl": 4445626368.0, "learning_rate": 2.862088996967003e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8244 }, { "completion_length": 507.0, "epoch": 2.28519955654102, "grad_norm": 0.0, "kl": 0.1659899652004242, "learning_rate": 2.861655815990268e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8245 }, { "completion_length": 594.5, "epoch": 2.285476718403548, "grad_norm": 0.0, "kl": 0.1436522901058197, "learning_rate": 2.8612226239230536e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8246 }, { "completion_length": 595.5, "epoch": 2.2857538802660753, "grad_norm": 0.0, "kl": 0.17700642347335815, "learning_rate": 2.860789420778644e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8247 }, { "completion_length": 523.25, "epoch": 2.286031042128603, "grad_norm": 0.0, "kl": 0.14363577961921692, "learning_rate": 2.8603562065703237e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8248 }, { "completion_length": 610.5, "epoch": 2.2863082039911307, "grad_norm": 0.0, "kl": 0.18659336864948273, "learning_rate": 2.8599229813113775e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8249 }, { "completion_length": 578.75, "epoch": 2.2865853658536586, "grad_norm": 0.3487418591976166, "kl": 1005231.75, "learning_rate": 2.8594897450150906e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8250 }, { "completion_length": 573.5, "epoch": 2.2868625277161865, "grad_norm": 0.0, "kl": 0.17530623078346252, "learning_rate": 2.8590564976947486e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8251 }, { "completion_length": 617.5, "epoch": 2.287139689578714, "grad_norm": 0.0, "kl": 0.1381560117006302, "learning_rate": 2.8586232393636378e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8252 }, { "completion_length": 614.25, "epoch": 2.287416851441242, "grad_norm": 0.0, "kl": 0.12913735210895538, "learning_rate": 2.8581899700350437e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8253 }, { "completion_length": 638.75, "epoch": 2.2876940133037693, "grad_norm": 0.0, "kl": 0.17445465922355652, "learning_rate": 2.8577566897222534e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8254 }, { "completion_length": 616.25, "epoch": 2.287971175166297, "grad_norm": 0.0, "kl": 0.21677319705486298, "learning_rate": 2.857323398438554e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8255 }, { "completion_length": 616.5, "epoch": 2.2882483370288247, "grad_norm": 0.0, "kl": 0.2951473295688629, "learning_rate": 2.8568900961972322e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8256 }, { "completion_length": 635.75, "epoch": 2.2885254988913526, "grad_norm": 0.0, "kl": 0.4643510580062866, "learning_rate": 2.8564567830115753e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8257 }, { "completion_length": 579.5, "epoch": 2.2888026607538805, "grad_norm": 0.0, "kl": 937430876160.0, "learning_rate": 2.8560234588948725e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8258 }, { "completion_length": 571.5, "epoch": 2.289079822616408, "grad_norm": 0.0, "kl": 0.18652646243572235, "learning_rate": 2.8555901238604107e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8259 }, { "completion_length": 596.5, "epoch": 2.289356984478936, "grad_norm": 0.0, "kl": 0.21788989007472992, "learning_rate": 2.8551567779214795e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8260 }, { "completion_length": 548.25, "epoch": 2.2896341463414633, "grad_norm": 0.0, "kl": 0.3859463930130005, "learning_rate": 2.8547234210913666e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8261 }, { "completion_length": 626.25, "epoch": 2.2899113082039912, "grad_norm": 0.0, "kl": 0.15623383224010468, "learning_rate": 2.854290053383363e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8262 }, { "completion_length": 516.5, "epoch": 2.2901884700665187, "grad_norm": 0.0, "kl": 0.17081427574157715, "learning_rate": 2.8538566748107567e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8263 }, { "completion_length": 975.25, "epoch": 2.2904656319290466, "grad_norm": 0.0, "kl": 0.14343903958797455, "learning_rate": 2.8534232853868384e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8264 }, { "completion_length": 533.0, "epoch": 2.2907427937915745, "grad_norm": 0.0, "kl": 0.17179477214813232, "learning_rate": 2.8529898851248972e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8265 }, { "completion_length": 559.0, "epoch": 2.291019955654102, "grad_norm": 0.9022113084793091, "kl": 240043638784.0, "learning_rate": 2.8525564740382256e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8266 }, { "completion_length": 509.75, "epoch": 2.29129711751663, "grad_norm": 0.0, "kl": 0.17209483683109283, "learning_rate": 2.8521230521401128e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8267 }, { "completion_length": 557.5, "epoch": 2.2915742793791574, "grad_norm": 0.0, "kl": 0.18707910180091858, "learning_rate": 2.8516896194438515e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8268 }, { "completion_length": 555.25, "epoch": 2.2918514412416853, "grad_norm": 0.0, "kl": 0.20136408507823944, "learning_rate": 2.8512561759627322e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8269 }, { "completion_length": 561.0, "epoch": 2.2921286031042127, "grad_norm": 0.0, "kl": 0.2400488406419754, "learning_rate": 2.850822721710047e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8270 }, { "completion_length": 634.0, "epoch": 2.2924057649667406, "grad_norm": 0.0, "kl": 0.14767129719257355, "learning_rate": 2.8503892566990887e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8271 }, { "completion_length": 522.5, "epoch": 2.292682926829268, "grad_norm": 0.0, "kl": 0.16104722023010254, "learning_rate": 2.8499557809431495e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8272 }, { "completion_length": 509.75, "epoch": 2.292960088691796, "grad_norm": 0.0, "kl": 0.16159051656723022, "learning_rate": 2.849522294455522e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8273 }, { "completion_length": 530.25, "epoch": 2.2932372505543235, "grad_norm": 0.0, "kl": 0.1761350929737091, "learning_rate": 2.8490887972494998e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8274 }, { "completion_length": 711.75, "epoch": 2.2935144124168514, "grad_norm": 0.0, "kl": 0.19008485972881317, "learning_rate": 2.848655289338377e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8275 }, { "completion_length": 547.75, "epoch": 2.2937915742793793, "grad_norm": 0.0, "kl": 0.8166060447692871, "learning_rate": 2.8482217707354453e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8276 }, { "completion_length": 604.75, "epoch": 2.2940687361419068, "grad_norm": 0.6930062770843506, "kl": 9504739328.0, "learning_rate": 2.847788241454002e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8277 }, { "completion_length": 598.75, "epoch": 2.2943458980044347, "grad_norm": 2.0156939029693604, "kl": 2088.621826171875, "learning_rate": 2.8473547015073392e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8278 }, { "completion_length": 600.5, "epoch": 2.294623059866962, "grad_norm": 0.0, "kl": 0.19638848304748535, "learning_rate": 2.8469211509087533e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8279 }, { "completion_length": 537.0, "epoch": 2.29490022172949, "grad_norm": 0.0, "kl": 0.20129238069057465, "learning_rate": 2.8464875896715383e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8280 }, { "completion_length": 547.25, "epoch": 2.2951773835920175, "grad_norm": 0.0, "kl": 0.18228165805339813, "learning_rate": 2.8460540178089907e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8281 }, { "completion_length": 550.25, "epoch": 2.2954545454545454, "grad_norm": 0.0, "kl": 0.18189239501953125, "learning_rate": 2.8456204353344052e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8282 }, { "completion_length": 509.0, "epoch": 2.2957317073170733, "grad_norm": 0.0, "kl": 0.17538420855998993, "learning_rate": 2.84518684226108e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8283 }, { "completion_length": 582.0, "epoch": 2.296008869179601, "grad_norm": 0.0, "kl": 0.18285124003887177, "learning_rate": 2.8447532386023093e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8284 }, { "completion_length": 597.25, "epoch": 2.2962860310421287, "grad_norm": 0.0, "kl": 0.17788776755332947, "learning_rate": 2.844319624371391e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8285 }, { "completion_length": 642.75, "epoch": 2.296563192904656, "grad_norm": 0.3438549041748047, "kl": 8782341120.0, "learning_rate": 2.8438859995816225e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8286 }, { "completion_length": 597.75, "epoch": 2.296840354767184, "grad_norm": 5.318035125732422, "kl": 146601.296875, "learning_rate": 2.8434523642463006e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8287 }, { "completion_length": 622.5, "epoch": 2.2971175166297115, "grad_norm": 0.0, "kl": 0.22072651982307434, "learning_rate": 2.8430187183787233e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8288 }, { "completion_length": 579.5, "epoch": 2.2973946784922394, "grad_norm": 0.0, "kl": 0.22357842326164246, "learning_rate": 2.842585061992189e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8289 }, { "completion_length": 629.5, "epoch": 2.2976718403547673, "grad_norm": 0.0, "kl": 0.1618850976228714, "learning_rate": 2.842151395099996e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8290 }, { "completion_length": 602.0, "epoch": 2.297949002217295, "grad_norm": 0.0, "kl": 0.16996648907661438, "learning_rate": 2.841717717715443e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8291 }, { "completion_length": 685.25, "epoch": 2.2982261640798227, "grad_norm": 0.0, "kl": 0.15844838321208954, "learning_rate": 2.8412840298518295e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8292 }, { "completion_length": 556.75, "epoch": 2.29850332594235, "grad_norm": 0.0, "kl": 0.16682039201259613, "learning_rate": 2.840850331522454e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8293 }, { "completion_length": 603.75, "epoch": 2.298780487804878, "grad_norm": 0.8089328408241272, "kl": 0.14457780122756958, "learning_rate": 2.840416622740617e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8294 }, { "completion_length": 524.25, "epoch": 2.2990576496674056, "grad_norm": 0.0, "kl": 0.18916475772857666, "learning_rate": 2.8399829035196173e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8295 }, { "completion_length": 599.0, "epoch": 2.2993348115299335, "grad_norm": 1.323427438735962, "kl": 10323606.0, "learning_rate": 2.839549173872758e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8296 }, { "completion_length": 594.25, "epoch": 2.2996119733924614, "grad_norm": 0.0, "kl": 0.17617987096309662, "learning_rate": 2.839115433813337e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8297 }, { "completion_length": 653.0, "epoch": 2.299889135254989, "grad_norm": 0.0, "kl": 0.18818435072898865, "learning_rate": 2.8386816833546567e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8298 }, { "completion_length": 586.5, "epoch": 2.3001662971175167, "grad_norm": 0.0, "kl": 0.15232501924037933, "learning_rate": 2.8382479225100178e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8299 }, { "completion_length": 570.5, "epoch": 2.300443458980044, "grad_norm": 0.0, "kl": 0.16576431691646576, "learning_rate": 2.8378141512927214e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8300 }, { "completion_length": 739.5, "epoch": 2.300720620842572, "grad_norm": 0.7472550868988037, "kl": 1164607356928.0, "learning_rate": 2.837380369716072e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8301 }, { "completion_length": 563.0, "epoch": 2.3009977827050996, "grad_norm": 0.0, "kl": 0.17819999158382416, "learning_rate": 2.836946577793369e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8302 }, { "completion_length": 632.75, "epoch": 2.3012749445676275, "grad_norm": 0.453400582075119, "kl": 0.1833430528640747, "learning_rate": 2.8365127755379165e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8303 }, { "completion_length": 603.25, "epoch": 2.3015521064301554, "grad_norm": 0.6063486337661743, "kl": 125378117632.0, "learning_rate": 2.836078962963016e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8304 }, { "completion_length": 787.5, "epoch": 2.301829268292683, "grad_norm": 0.0, "kl": 0.1393718123435974, "learning_rate": 2.8356451400819727e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8305 }, { "completion_length": 632.5, "epoch": 2.3021064301552108, "grad_norm": 0.0, "kl": 0.18658298254013062, "learning_rate": 2.8352113069080893e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8306 }, { "completion_length": 689.0, "epoch": 2.3023835920177382, "grad_norm": 0.0, "kl": 4.58541202545166, "learning_rate": 2.83477746345467e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8307 }, { "completion_length": 699.25, "epoch": 2.302660753880266, "grad_norm": 0.0, "kl": 0.24311424791812897, "learning_rate": 2.8343436097350173e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8308 }, { "completion_length": 775.75, "epoch": 2.3029379157427936, "grad_norm": 0.0, "kl": 0.1539652943611145, "learning_rate": 2.8339097457624383e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8309 }, { "completion_length": 617.25, "epoch": 2.3032150776053215, "grad_norm": 0.0, "kl": 13462137.0, "learning_rate": 2.8334758715502346e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8310 }, { "completion_length": 596.75, "epoch": 2.3034922394678494, "grad_norm": 0.0, "kl": 0.16294267773628235, "learning_rate": 2.833041987111715e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8311 }, { "completion_length": 707.5, "epoch": 2.303769401330377, "grad_norm": 0.0, "kl": 0.14896292984485626, "learning_rate": 2.832608092460182e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8312 }, { "completion_length": 685.5, "epoch": 2.304046563192905, "grad_norm": 0.0, "kl": 0.1601254940032959, "learning_rate": 2.8321741876089432e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8313 }, { "completion_length": 618.5, "epoch": 2.3043237250554323, "grad_norm": 0.0, "kl": 0.1968049257993698, "learning_rate": 2.831740272571303e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8314 }, { "completion_length": 761.75, "epoch": 2.30460088691796, "grad_norm": 0.0, "kl": 0.12589089572429657, "learning_rate": 2.8313063473605694e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8315 }, { "completion_length": 758.75, "epoch": 2.3048780487804876, "grad_norm": 0.0, "kl": 0.1317974030971527, "learning_rate": 2.8308724119900475e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8316 }, { "completion_length": 665.5, "epoch": 2.3051552106430155, "grad_norm": 0.0, "kl": 0.13143569231033325, "learning_rate": 2.8304384664730455e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8317 }, { "completion_length": 653.75, "epoch": 2.3054323725055434, "grad_norm": 0.0, "kl": 0.14210551977157593, "learning_rate": 2.83000451082287e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8318 }, { "completion_length": 751.5, "epoch": 2.305709534368071, "grad_norm": 0.3267117440700531, "kl": 3616002.25, "learning_rate": 2.82957054505283e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8319 }, { "completion_length": 623.75, "epoch": 2.305986696230599, "grad_norm": 0.0, "kl": 0.7137260437011719, "learning_rate": 2.8291365691762313e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8320 }, { "completion_length": 560.25, "epoch": 2.3062638580931263, "grad_norm": 0.0, "kl": 0.15076225996017456, "learning_rate": 2.8287025832063837e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8321 }, { "completion_length": 726.5, "epoch": 2.306541019955654, "grad_norm": 0.0, "kl": 0.1588258147239685, "learning_rate": 2.828268587156595e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8322 }, { "completion_length": 588.75, "epoch": 2.3068181818181817, "grad_norm": 0.0, "kl": 0.15750785171985626, "learning_rate": 2.8278345810401746e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8323 }, { "completion_length": 597.5, "epoch": 2.3070953436807096, "grad_norm": 0.0, "kl": 0.16435833275318146, "learning_rate": 2.8274005648704316e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8324 }, { "completion_length": 658.0, "epoch": 2.3073725055432375, "grad_norm": 0.0, "kl": 0.17250008881092072, "learning_rate": 2.8269665386606736e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8325 }, { "completion_length": 620.5, "epoch": 2.307649667405765, "grad_norm": 0.0, "kl": 0.520532488822937, "learning_rate": 2.826532502424214e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8326 }, { "completion_length": 549.5, "epoch": 2.307926829268293, "grad_norm": 0.0, "kl": 0.48128828406333923, "learning_rate": 2.82609845617436e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8327 }, { "completion_length": 620.0, "epoch": 2.3082039911308203, "grad_norm": 0.0, "kl": 0.18669812381267548, "learning_rate": 2.8256643999244234e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8328 }, { "completion_length": 531.0, "epoch": 2.308481152993348, "grad_norm": 0.44505658745765686, "kl": 0.19866646826267242, "learning_rate": 2.8252303336877136e-06, "loss": -0.0, "reward": 1.625, "reward_std": 0.25, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8329 }, { "completion_length": 689.0, "epoch": 2.3087583148558757, "grad_norm": 0.0, "kl": 0.18412533402442932, "learning_rate": 2.8247962574775435e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8330 }, { "completion_length": 533.25, "epoch": 2.3090354767184036, "grad_norm": 0.0, "kl": 0.17715531587600708, "learning_rate": 2.8243621713072224e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8331 }, { "completion_length": 674.0, "epoch": 2.3093126385809315, "grad_norm": 0.0, "kl": 0.1513034850358963, "learning_rate": 2.823928075190063e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8332 }, { "completion_length": 591.75, "epoch": 2.309589800443459, "grad_norm": 0.0, "kl": 0.1849796175956726, "learning_rate": 2.8234939691393765e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8333 }, { "completion_length": 508.5, "epoch": 2.309866962305987, "grad_norm": 0.0, "kl": 0.5317915678024292, "learning_rate": 2.8230598531684766e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8334 }, { "completion_length": 660.0, "epoch": 2.3101441241685143, "grad_norm": 0.0, "kl": 0.15750093758106232, "learning_rate": 2.8226257272906743e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8335 }, { "completion_length": 545.75, "epoch": 2.3104212860310422, "grad_norm": 0.0, "kl": 0.18160736560821533, "learning_rate": 2.822191591519283e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8336 }, { "completion_length": 541.0, "epoch": 2.3106984478935697, "grad_norm": 0.0, "kl": 0.18661817908287048, "learning_rate": 2.821757445867616e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8337 }, { "completion_length": 750.25, "epoch": 2.3109756097560976, "grad_norm": 0.0, "kl": 0.1330396831035614, "learning_rate": 2.821323290348987e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8338 }, { "completion_length": 577.75, "epoch": 2.3112527716186255, "grad_norm": 0.0, "kl": 0.18283215165138245, "learning_rate": 2.820889124976709e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8339 }, { "completion_length": 711.75, "epoch": 2.311529933481153, "grad_norm": 0.513620913028717, "kl": 344720832.0, "learning_rate": 2.820454949764096e-06, "loss": 0.0, "reward": 4.625, "reward_std": 2.25, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8340 }, { "completion_length": 602.5, "epoch": 2.311807095343681, "grad_norm": 0.0, "kl": 0.15510310232639313, "learning_rate": 2.820020764724464e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8341 }, { "completion_length": 727.5, "epoch": 2.3120842572062084, "grad_norm": 0.0, "kl": 0.2080976665019989, "learning_rate": 2.819586569871125e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8342 }, { "completion_length": 607.0, "epoch": 2.3123614190687363, "grad_norm": 0.0, "kl": 0.16448840498924255, "learning_rate": 2.8191523652173965e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8343 }, { "completion_length": 600.5, "epoch": 2.3126385809312637, "grad_norm": 2.4544708728790283, "kl": 3940809179136.0, "learning_rate": 2.818718150776592e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8344 }, { "completion_length": 660.0, "epoch": 2.3129157427937916, "grad_norm": 0.0, "kl": 0.14250318706035614, "learning_rate": 2.8182839265620277e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8345 }, { "completion_length": 668.5, "epoch": 2.313192904656319, "grad_norm": 0.0, "kl": 0.14858384430408478, "learning_rate": 2.81784969258702e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8346 }, { "completion_length": 609.0, "epoch": 2.313470066518847, "grad_norm": NaN, "kl": 92935.2265625, "learning_rate": 2.817415448864883e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8347 }, { "completion_length": 534.5, "epoch": 2.3137472283813745, "grad_norm": 0.0, "kl": 0.17957080900669098, "learning_rate": 2.817415448864883e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8348 }, { "completion_length": 609.5, "epoch": 2.3140243902439024, "grad_norm": 0.0, "kl": 0.1704520434141159, "learning_rate": 2.8169811954089356e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8349 }, { "completion_length": 593.75, "epoch": 2.3143015521064303, "grad_norm": 0.0, "kl": 310.0550231933594, "learning_rate": 2.8165469322324932e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8350 }, { "completion_length": 610.0, "epoch": 2.3145787139689578, "grad_norm": 0.0, "kl": 0.1545659452676773, "learning_rate": 2.8161126593488737e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8351 }, { "completion_length": 742.75, "epoch": 2.3148558758314857, "grad_norm": 2.164846658706665, "kl": 4742714490880.0, "learning_rate": 2.8156783767713937e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8352 }, { "completion_length": 628.5, "epoch": 2.315133037694013, "grad_norm": 0.0, "kl": 0.23888596892356873, "learning_rate": 2.815244084513371e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8353 }, { "completion_length": 584.25, "epoch": 2.315410199556541, "grad_norm": 0.0, "kl": 0.15699167549610138, "learning_rate": 2.8148097825881236e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8354 }, { "completion_length": 621.5, "epoch": 2.3156873614190685, "grad_norm": 0.0, "kl": 0.9247247576713562, "learning_rate": 2.8143754710089694e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8355 }, { "completion_length": 615.5, "epoch": 2.3159645232815964, "grad_norm": 0.0, "kl": 0.15345124900341034, "learning_rate": 2.8139411497892276e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8356 }, { "completion_length": 685.5, "epoch": 2.3162416851441243, "grad_norm": 0.0, "kl": 0.1306730955839157, "learning_rate": 2.8135068189422172e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8357 }, { "completion_length": 602.0, "epoch": 2.316518847006652, "grad_norm": 0.0, "kl": 0.21889276802539825, "learning_rate": 2.8130724784812563e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8358 }, { "completion_length": 612.25, "epoch": 2.3167960088691797, "grad_norm": 0.0, "kl": 0.15393640100955963, "learning_rate": 2.8126381284196652e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8359 }, { "completion_length": 613.75, "epoch": 2.317073170731707, "grad_norm": 0.0, "kl": 0.15625499188899994, "learning_rate": 2.8122037687707628e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8360 }, { "completion_length": 678.0, "epoch": 2.317350332594235, "grad_norm": 0.0, "kl": 0.16276517510414124, "learning_rate": 2.81176939954787e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8361 }, { "completion_length": 549.0, "epoch": 2.3176274944567625, "grad_norm": 0.0, "kl": 0.1949205994606018, "learning_rate": 2.811335020764306e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8362 }, { "completion_length": 602.75, "epoch": 2.3179046563192904, "grad_norm": 0.0, "kl": 0.1904134750366211, "learning_rate": 2.8109006324333923e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8363 }, { "completion_length": 669.0, "epoch": 2.3181818181818183, "grad_norm": 0.0, "kl": 0.163996160030365, "learning_rate": 2.810466234568449e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8364 }, { "completion_length": 610.5, "epoch": 2.318458980044346, "grad_norm": 1.619895100593567, "kl": 413175.625, "learning_rate": 2.810031827182799e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8365 }, { "completion_length": 638.75, "epoch": 2.3187361419068737, "grad_norm": 0.0, "kl": 0.15800273418426514, "learning_rate": 2.809597410289762e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8366 }, { "completion_length": 609.75, "epoch": 2.319013303769401, "grad_norm": 0.0, "kl": 0.14475105702877045, "learning_rate": 2.8091629839026608e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8367 }, { "completion_length": 706.75, "epoch": 2.319290465631929, "grad_norm": 0.0, "kl": 0.14291998744010925, "learning_rate": 2.8087285480348157e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8368 }, { "completion_length": 728.0, "epoch": 2.3195676274944566, "grad_norm": 0.0, "kl": 0.1424352079629898, "learning_rate": 2.8082941026995512e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8369 }, { "completion_length": 555.5, "epoch": 2.3198447893569845, "grad_norm": 0.0, "kl": 0.22274568676948547, "learning_rate": 2.8078596479101887e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8370 }, { "completion_length": 588.75, "epoch": 2.3201219512195124, "grad_norm": 0.0, "kl": 0.2115219086408615, "learning_rate": 2.8074251836800514e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8371 }, { "completion_length": 609.0, "epoch": 2.32039911308204, "grad_norm": 0.0, "kl": 0.17736588418483734, "learning_rate": 2.806990710022463e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8372 }, { "completion_length": 678.25, "epoch": 2.3206762749445677, "grad_norm": 0.0, "kl": 865508720640.0, "learning_rate": 2.8065562269507464e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8373 }, { "completion_length": 587.25, "epoch": 2.320953436807095, "grad_norm": 0.0, "kl": 0.1934933066368103, "learning_rate": 2.806121734478226e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8374 }, { "completion_length": 555.5, "epoch": 2.321230598669623, "grad_norm": 0.3797866702079773, "kl": 0.5474745631217957, "learning_rate": 2.805687232618225e-06, "loss": 0.0, "reward": 5.625, "reward_std": 0.25, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.75, "step": 8375 }, { "completion_length": 592.5, "epoch": 2.3215077605321506, "grad_norm": 0.0, "kl": 8297403.0, "learning_rate": 2.8052527213840684e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8376 }, { "completion_length": 616.0, "epoch": 2.3217849223946785, "grad_norm": 0.0, "kl": 0.1545180082321167, "learning_rate": 2.80481820078908e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8377 }, { "completion_length": 546.75, "epoch": 2.3220620842572064, "grad_norm": 0.0, "kl": 0.29756373167037964, "learning_rate": 2.804383670846586e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8378 }, { "completion_length": 563.25, "epoch": 2.322339246119734, "grad_norm": 0.0, "kl": 0.15402579307556152, "learning_rate": 2.803949131569911e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8379 }, { "completion_length": 538.5, "epoch": 2.3226164079822618, "grad_norm": 0.0, "kl": 0.16271935403347015, "learning_rate": 2.8035145829723804e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8380 }, { "completion_length": 567.5, "epoch": 2.3228935698447892, "grad_norm": 0.0, "kl": 0.17183545231819153, "learning_rate": 2.8030800250673194e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8381 }, { "completion_length": 597.0, "epoch": 2.323170731707317, "grad_norm": 0.0, "kl": 0.20279960334300995, "learning_rate": 2.802645457868056e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8382 }, { "completion_length": 604.5, "epoch": 2.3234478935698446, "grad_norm": 0.0, "kl": 0.15034577250480652, "learning_rate": 2.8022108813879153e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8383 }, { "completion_length": 600.75, "epoch": 2.3237250554323725, "grad_norm": 1.6932986974716187, "kl": 0.16439075767993927, "learning_rate": 2.801776295640223e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8384 }, { "completion_length": 612.0, "epoch": 2.3240022172949004, "grad_norm": 0.0, "kl": 0.16284748911857605, "learning_rate": 2.8013417006383078e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8385 }, { "completion_length": 619.0, "epoch": 2.324279379157428, "grad_norm": 0.0, "kl": 0.2936164438724518, "learning_rate": 2.8009070963954967e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8386 }, { "completion_length": 668.25, "epoch": 2.324556541019956, "grad_norm": 0.0, "kl": 0.1481722593307495, "learning_rate": 2.8004724829251154e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8387 }, { "completion_length": 524.25, "epoch": 2.3248337028824833, "grad_norm": 0.0, "kl": 0.17268705368041992, "learning_rate": 2.800037860240494e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8388 }, { "completion_length": 623.0, "epoch": 2.325110864745011, "grad_norm": 0.0, "kl": 0.14215730130672455, "learning_rate": 2.7996032283549594e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8389 }, { "completion_length": 640.5, "epoch": 2.3253880266075386, "grad_norm": 0.0, "kl": 0.17555634677410126, "learning_rate": 2.7991685872818395e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8390 }, { "completion_length": 728.0, "epoch": 2.3256651884700665, "grad_norm": 0.0, "kl": 0.2161056399345398, "learning_rate": 2.7987339370344647e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8391 }, { "completion_length": 653.5, "epoch": 2.3259423503325944, "grad_norm": 0.0, "kl": 0.15770214796066284, "learning_rate": 2.798299277626162e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8392 }, { "completion_length": 659.0, "epoch": 2.326219512195122, "grad_norm": 0.0, "kl": 13588389429248.0, "learning_rate": 2.797864609070261e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8393 }, { "completion_length": 668.0, "epoch": 2.32649667405765, "grad_norm": 0.0, "kl": 0.14467951655387878, "learning_rate": 2.7974299313800916e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8394 }, { "completion_length": 518.25, "epoch": 2.3267738359201773, "grad_norm": 0.0, "kl": 0.19989971816539764, "learning_rate": 2.796995244568984e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8395 }, { "completion_length": 639.25, "epoch": 2.327050997782705, "grad_norm": 0.0, "kl": 0.17064377665519714, "learning_rate": 2.7965605486502677e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8396 }, { "completion_length": 683.0, "epoch": 2.3273281596452327, "grad_norm": 2.875959873199463, "kl": 929.9686279296875, "learning_rate": 2.796125843637273e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8397 }, { "completion_length": 594.5, "epoch": 2.3276053215077606, "grad_norm": 0.0, "kl": 1655142.25, "learning_rate": 2.7956911295433302e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8398 }, { "completion_length": 577.25, "epoch": 2.3278824833702885, "grad_norm": 0.0, "kl": 0.21496695280075073, "learning_rate": 2.7952564063817707e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8399 }, { "completion_length": 646.75, "epoch": 2.328159645232816, "grad_norm": 1.0601004362106323, "kl": 3519.009033203125, "learning_rate": 2.7948216741659253e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8400 }, { "completion_length": 567.75, "epoch": 2.328436807095344, "grad_norm": 0.0, "kl": 0.17756295204162598, "learning_rate": 2.7943869329091263e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8401 }, { "completion_length": 732.0, "epoch": 2.3287139689578713, "grad_norm": 0.0, "kl": 0.3666497766971588, "learning_rate": 2.7939521826247036e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8402 }, { "completion_length": 608.25, "epoch": 2.328991130820399, "grad_norm": 0.0, "kl": 0.1563391089439392, "learning_rate": 2.7935174233259906e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8403 }, { "completion_length": 711.75, "epoch": 2.3292682926829267, "grad_norm": 0.0, "kl": 0.12187852710485458, "learning_rate": 2.7930826550263197e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8404 }, { "completion_length": 671.0, "epoch": 2.3295454545454546, "grad_norm": 0.0, "kl": 0.20065224170684814, "learning_rate": 2.7926478777390228e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8405 }, { "completion_length": 562.75, "epoch": 2.3298226164079825, "grad_norm": 0.0, "kl": 0.15941284596920013, "learning_rate": 2.792213091477433e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8406 }, { "completion_length": 535.25, "epoch": 2.33009977827051, "grad_norm": 0.0, "kl": 0.18166913092136383, "learning_rate": 2.791778296254883e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8407 }, { "completion_length": 549.75, "epoch": 2.330376940133038, "grad_norm": 0.0, "kl": 0.20106667280197144, "learning_rate": 2.791343492084706e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8408 }, { "completion_length": 602.0, "epoch": 2.3306541019955653, "grad_norm": 0.0, "kl": 3801.89111328125, "learning_rate": 2.7909086789802363e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8409 }, { "completion_length": 654.0, "epoch": 2.3309312638580932, "grad_norm": 0.0, "kl": 0.14819039404392242, "learning_rate": 2.790473856954808e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8410 }, { "completion_length": 469.25, "epoch": 2.3312084257206207, "grad_norm": 0.0, "kl": 0.548600971698761, "learning_rate": 2.7900390260217546e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8411 }, { "completion_length": 534.25, "epoch": 2.3314855875831486, "grad_norm": 0.0, "kl": 0.18704719841480255, "learning_rate": 2.789604186194411e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8412 }, { "completion_length": 615.25, "epoch": 2.3317627494456765, "grad_norm": 0.0, "kl": 0.2041175365447998, "learning_rate": 2.7891693374861113e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8413 }, { "completion_length": 513.5, "epoch": 2.332039911308204, "grad_norm": 0.0, "kl": 0.17668774724006653, "learning_rate": 2.788734479910192e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8414 }, { "completion_length": 589.75, "epoch": 2.332317073170732, "grad_norm": 0.0, "kl": 0.15997105836868286, "learning_rate": 2.7882996134799854e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8415 }, { "completion_length": 603.25, "epoch": 2.3325942350332594, "grad_norm": 0.0, "kl": 0.16570092737674713, "learning_rate": 2.7878647382088303e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8416 }, { "completion_length": 562.75, "epoch": 2.3328713968957873, "grad_norm": 0.0, "kl": 0.17826692759990692, "learning_rate": 2.7874298541100608e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8417 }, { "completion_length": 591.0, "epoch": 2.3331485587583147, "grad_norm": 0.0, "kl": 0.18565551936626434, "learning_rate": 2.786994961197014e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8418 }, { "completion_length": 622.25, "epoch": 2.3334257206208426, "grad_norm": 0.0, "kl": 635976.3125, "learning_rate": 2.7865600594830245e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8419 }, { "completion_length": 528.75, "epoch": 2.33370288248337, "grad_norm": 0.0, "kl": 0.18654251098632812, "learning_rate": 2.7861251489814305e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8420 }, { "completion_length": 593.75, "epoch": 2.333980044345898, "grad_norm": 0.0, "kl": 0.1653163582086563, "learning_rate": 2.7856902297055686e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8421 }, { "completion_length": 479.25, "epoch": 2.3342572062084255, "grad_norm": 0.0, "kl": 2136.18994140625, "learning_rate": 2.785255301668776e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8422 }, { "completion_length": 570.75, "epoch": 2.3345343680709534, "grad_norm": 0.0, "kl": 0.20108462870121002, "learning_rate": 2.784820364884389e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8423 }, { "completion_length": 613.0, "epoch": 2.3348115299334813, "grad_norm": 0.0, "kl": 0.165830597281456, "learning_rate": 2.7843854193657473e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8424 }, { "completion_length": 595.0, "epoch": 2.3350886917960088, "grad_norm": 0.0, "kl": 0.1816173940896988, "learning_rate": 2.7839504651261873e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8425 }, { "completion_length": 572.75, "epoch": 2.3353658536585367, "grad_norm": 0.0, "kl": 0.17370270192623138, "learning_rate": 2.7835155021790478e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8426 }, { "completion_length": 577.0, "epoch": 2.335643015521064, "grad_norm": 0.0, "kl": 0.18561075627803802, "learning_rate": 2.783080530537668e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8427 }, { "completion_length": 596.25, "epoch": 2.335920177383592, "grad_norm": 0.0, "kl": 85.44944763183594, "learning_rate": 2.782645550215385e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8428 }, { "completion_length": 663.0, "epoch": 2.3361973392461195, "grad_norm": 0.0, "kl": 0.17929889261722565, "learning_rate": 2.782210561225539e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8429 }, { "completion_length": 495.25, "epoch": 2.3364745011086474, "grad_norm": 0.0, "kl": 0.21564050018787384, "learning_rate": 2.781775563581468e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8430 }, { "completion_length": 652.25, "epoch": 2.3367516629711753, "grad_norm": 0.0, "kl": 0.16404257714748383, "learning_rate": 2.781340557296514e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8431 }, { "completion_length": 586.75, "epoch": 2.337028824833703, "grad_norm": 0.0, "kl": 0.17384755611419678, "learning_rate": 2.7809055423840154e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8432 }, { "completion_length": 547.75, "epoch": 2.3373059866962307, "grad_norm": 0.0, "kl": 0.1934518665075302, "learning_rate": 2.7804705188573124e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8433 }, { "completion_length": 560.5, "epoch": 2.337583148558758, "grad_norm": 0.0, "kl": 0.2285163551568985, "learning_rate": 2.7800354867297445e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8434 }, { "completion_length": 606.25, "epoch": 2.337860310421286, "grad_norm": 0.0, "kl": 0.20072726905345917, "learning_rate": 2.779600446014653e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8435 }, { "completion_length": 500.5, "epoch": 2.3381374722838135, "grad_norm": 0.0, "kl": 98908.7578125, "learning_rate": 2.7791653967253802e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8436 }, { "completion_length": 567.75, "epoch": 2.3384146341463414, "grad_norm": 0.0, "kl": 0.19126483798027039, "learning_rate": 2.7787303388752653e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8437 }, { "completion_length": 627.5, "epoch": 2.3386917960088693, "grad_norm": 0.0, "kl": 0.14461301267147064, "learning_rate": 2.7782952724776503e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8438 }, { "completion_length": 640.75, "epoch": 2.338968957871397, "grad_norm": 0.0, "kl": 0.16369794309139252, "learning_rate": 2.7778601975458764e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8439 }, { "completion_length": 574.25, "epoch": 2.3392461197339247, "grad_norm": 0.0, "kl": 0.19909222424030304, "learning_rate": 2.7774251140932873e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8440 }, { "completion_length": 556.5, "epoch": 2.339523281596452, "grad_norm": 0.0, "kl": 0.21705196797847748, "learning_rate": 2.7769900221332237e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8441 }, { "completion_length": 655.75, "epoch": 2.33980044345898, "grad_norm": 0.0, "kl": 0.15333431959152222, "learning_rate": 2.7765549216790283e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8442 }, { "completion_length": 571.25, "epoch": 2.3400776053215075, "grad_norm": 0.0, "kl": 0.18868301808834076, "learning_rate": 2.776119812744043e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8443 }, { "completion_length": 525.0, "epoch": 2.3403547671840355, "grad_norm": 0.0, "kl": 0.18324407935142517, "learning_rate": 2.7756846953416135e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8444 }, { "completion_length": 500.0, "epoch": 2.3406319290465634, "grad_norm": 0.0, "kl": 0.18891683220863342, "learning_rate": 2.775249569485079e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8445 }, { "completion_length": 597.25, "epoch": 2.340909090909091, "grad_norm": 0.0, "kl": 0.14246726036071777, "learning_rate": 2.7748144351877864e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8446 }, { "completion_length": 590.5, "epoch": 2.3411862527716187, "grad_norm": 0.0, "kl": 0.19696204364299774, "learning_rate": 2.7743792924630788e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8447 }, { "completion_length": 590.5, "epoch": 2.341463414634146, "grad_norm": 0.0, "kl": 0.1778683364391327, "learning_rate": 2.773944141324299e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8448 }, { "completion_length": 542.75, "epoch": 2.341740576496674, "grad_norm": 0.0, "kl": 0.591325044631958, "learning_rate": 2.773508981784792e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8449 }, { "completion_length": 546.5, "epoch": 2.3420177383592016, "grad_norm": 0.0, "kl": 582.22021484375, "learning_rate": 2.773073813857902e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8450 }, { "completion_length": 538.25, "epoch": 2.3422949002217295, "grad_norm": 0.0, "kl": 0.15450339019298553, "learning_rate": 2.7726386375569748e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8451 }, { "completion_length": 525.75, "epoch": 2.3425720620842574, "grad_norm": 0.0, "kl": 0.3517889380455017, "learning_rate": 2.7722034528953546e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8452 }, { "completion_length": 538.25, "epoch": 2.342849223946785, "grad_norm": 0.0, "kl": 0.1813112497329712, "learning_rate": 2.771768259886386e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8453 }, { "completion_length": 692.0, "epoch": 2.3431263858093128, "grad_norm": 0.0, "kl": 0.1548030972480774, "learning_rate": 2.771333058543416e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8454 }, { "completion_length": 590.0, "epoch": 2.3434035476718402, "grad_norm": 0.0, "kl": 0.15005289018154144, "learning_rate": 2.77089784887979e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8455 }, { "completion_length": 586.75, "epoch": 2.343680709534368, "grad_norm": 0.0, "kl": 0.15345367789268494, "learning_rate": 2.770462630908854e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8456 }, { "completion_length": 597.0, "epoch": 2.3439578713968956, "grad_norm": 0.0, "kl": 0.20001834630966187, "learning_rate": 2.770027404643954e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8457 }, { "completion_length": 584.0, "epoch": 2.3442350332594235, "grad_norm": 0.0, "kl": 0.1598275601863861, "learning_rate": 2.769592170098436e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8458 }, { "completion_length": 685.25, "epoch": 2.3445121951219514, "grad_norm": 0.0, "kl": 0.15993501245975494, "learning_rate": 2.769156927285649e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8459 }, { "completion_length": 664.0, "epoch": 2.344789356984479, "grad_norm": 0.0, "kl": 0.15997932851314545, "learning_rate": 2.7687216762189374e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8460 }, { "completion_length": 500.0, "epoch": 2.345066518847007, "grad_norm": 0.0, "kl": 0.48267287015914917, "learning_rate": 2.7682864169116506e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8461 }, { "completion_length": 744.25, "epoch": 2.3453436807095343, "grad_norm": 0.9505440592765808, "kl": 973.3838500976562, "learning_rate": 2.7678511493771354e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8462 }, { "completion_length": 621.5, "epoch": 2.345620842572062, "grad_norm": 0.0, "kl": 0.1584567278623581, "learning_rate": 2.7674158736287397e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8463 }, { "completion_length": 594.25, "epoch": 2.3458980044345896, "grad_norm": 0.0, "kl": 0.25850170850753784, "learning_rate": 2.766980589679812e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8464 }, { "completion_length": 571.0, "epoch": 2.3461751662971175, "grad_norm": 0.0, "kl": 0.17654533684253693, "learning_rate": 2.7665452975436994e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8465 }, { "completion_length": 610.5, "epoch": 2.3464523281596454, "grad_norm": 0.0, "kl": 0.17939947545528412, "learning_rate": 2.7661099972337517e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8466 }, { "completion_length": 619.0, "epoch": 2.346729490022173, "grad_norm": 0.0, "kl": 0.16533483564853668, "learning_rate": 2.7656746887633174e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8467 }, { "completion_length": 558.25, "epoch": 2.347006651884701, "grad_norm": 0.0, "kl": 0.46503323316574097, "learning_rate": 2.765239372145745e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8468 }, { "completion_length": 644.75, "epoch": 2.3472838137472283, "grad_norm": 0.4246183931827545, "kl": 1111125.5, "learning_rate": 2.764804047394385e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8469 }, { "completion_length": 602.75, "epoch": 2.347560975609756, "grad_norm": 1.5299620628356934, "kl": 64120103632896.0, "learning_rate": 2.7643687145225863e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8470 }, { "completion_length": 537.75, "epoch": 2.3478381374722836, "grad_norm": 0.0, "kl": 0.17722104489803314, "learning_rate": 2.763933373543699e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8471 }, { "completion_length": 701.5, "epoch": 2.3481152993348116, "grad_norm": 0.0, "kl": 0.1628594696521759, "learning_rate": 2.763498024471073e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8472 }, { "completion_length": 575.5, "epoch": 2.3483924611973395, "grad_norm": 0.48254629969596863, "kl": 2819167879168.0, "learning_rate": 2.763062667318059e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8473 }, { "completion_length": 639.25, "epoch": 2.348669623059867, "grad_norm": 0.0, "kl": 78004.0859375, "learning_rate": 2.762627302098007e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8474 }, { "completion_length": 754.75, "epoch": 2.348946784922395, "grad_norm": 0.0, "kl": 0.16025632619857788, "learning_rate": 2.762191928824267e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8475 }, { "completion_length": 615.25, "epoch": 2.3492239467849223, "grad_norm": 0.0, "kl": 0.1815459132194519, "learning_rate": 2.7617565475101934e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8476 }, { "completion_length": 600.5, "epoch": 2.34950110864745, "grad_norm": 0.0, "kl": 0.22181282937526703, "learning_rate": 2.761321158169134e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8477 }, { "completion_length": 636.0, "epoch": 2.3497782705099777, "grad_norm": 0.0, "kl": 0.195576012134552, "learning_rate": 2.760885760814442e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8478 }, { "completion_length": 740.0, "epoch": 2.3500554323725056, "grad_norm": 0.0, "kl": 0.37210917472839355, "learning_rate": 2.7604503554594693e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8479 }, { "completion_length": 592.25, "epoch": 2.3503325942350335, "grad_norm": 0.0, "kl": 0.17518138885498047, "learning_rate": 2.7600149421175678e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8480 }, { "completion_length": 613.5, "epoch": 2.350609756097561, "grad_norm": 0.0, "kl": 0.18471504747867584, "learning_rate": 2.7595795208020886e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8481 }, { "completion_length": 628.5, "epoch": 2.350886917960089, "grad_norm": 0.0, "kl": 0.160867840051651, "learning_rate": 2.759144091526386e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8482 }, { "completion_length": 643.75, "epoch": 2.3511640798226163, "grad_norm": 1.0383464097976685, "kl": 67.90650939941406, "learning_rate": 2.7587086543038123e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8483 }, { "completion_length": 692.5, "epoch": 2.3514412416851442, "grad_norm": 0.0, "kl": 0.1592044085264206, "learning_rate": 2.7582732091477205e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8484 }, { "completion_length": 463.75, "epoch": 2.3517184035476717, "grad_norm": 0.0, "kl": 0.18713712692260742, "learning_rate": 2.757837756071463e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8485 }, { "completion_length": 550.5, "epoch": 2.3519955654101996, "grad_norm": 0.0, "kl": 0.44458717107772827, "learning_rate": 2.7574022950883943e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8486 }, { "completion_length": 609.25, "epoch": 2.3522727272727275, "grad_norm": 0.0, "kl": 0.4168698191642761, "learning_rate": 2.7569668262118685e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8487 }, { "completion_length": 563.0, "epoch": 2.352549889135255, "grad_norm": 2.8727614879608154, "kl": 11168.669921875, "learning_rate": 2.7565313494552386e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8488 }, { "completion_length": 627.75, "epoch": 2.352827050997783, "grad_norm": 0.4100586473941803, "kl": 45049064718336.0, "learning_rate": 2.7560958648318605e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8489 }, { "completion_length": 658.75, "epoch": 2.3531042128603104, "grad_norm": 0.0, "kl": 0.19993466138839722, "learning_rate": 2.7556603723550855e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8490 }, { "completion_length": 673.75, "epoch": 2.3533813747228383, "grad_norm": 0.0, "kl": 0.16535136103630066, "learning_rate": 2.755224872038272e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8491 }, { "completion_length": 620.25, "epoch": 2.3536585365853657, "grad_norm": 0.0, "kl": 0.16932234168052673, "learning_rate": 2.7547893638947725e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8492 }, { "completion_length": 609.25, "epoch": 2.3539356984478936, "grad_norm": 0.0, "kl": 0.16591010987758636, "learning_rate": 2.754353847937944e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8493 }, { "completion_length": 629.25, "epoch": 2.354212860310421, "grad_norm": 0.0, "kl": 0.2617187201976776, "learning_rate": 2.7539183241811405e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8494 }, { "completion_length": 609.5, "epoch": 2.354490022172949, "grad_norm": 0.0, "kl": 0.18737706542015076, "learning_rate": 2.753482792637719e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8495 }, { "completion_length": 723.25, "epoch": 2.354767184035477, "grad_norm": 0.0, "kl": 0.22758769989013672, "learning_rate": 2.7530472533210335e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8496 }, { "completion_length": 656.75, "epoch": 2.3550443458980044, "grad_norm": 0.0, "kl": 0.13664698600769043, "learning_rate": 2.752611706244442e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8497 }, { "completion_length": 621.75, "epoch": 2.3553215077605323, "grad_norm": 0.0, "kl": 0.20188985764980316, "learning_rate": 2.7521761514213003e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8498 }, { "completion_length": 645.5, "epoch": 2.3555986696230597, "grad_norm": 3.3304667472839355, "kl": 460080992.0, "learning_rate": 2.751740588864965e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8499 }, { "completion_length": 628.5, "epoch": 2.3558758314855877, "grad_norm": 0.0, "kl": 0.19710981845855713, "learning_rate": 2.751305018588793e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8500 }, { "completion_length": 519.25, "epoch": 2.356152993348115, "grad_norm": 0.0, "kl": 0.16885100305080414, "learning_rate": 2.7508694406061413e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8501 }, { "completion_length": 625.75, "epoch": 2.356430155210643, "grad_norm": 0.0, "kl": 0.1651039719581604, "learning_rate": 2.7504338549303684e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8502 }, { "completion_length": 602.0, "epoch": 2.3567073170731705, "grad_norm": 4.453011512756348, "kl": 21525.98828125, "learning_rate": 2.74999826157483e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8503 }, { "completion_length": 610.75, "epoch": 2.3569844789356984, "grad_norm": 0.0, "kl": 142308.09375, "learning_rate": 2.749562660552885e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8504 }, { "completion_length": 622.5, "epoch": 2.3572616407982263, "grad_norm": 0.0, "kl": 0.2013012319803238, "learning_rate": 2.7491270518778913e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8505 }, { "completion_length": 686.5, "epoch": 2.3575388026607538, "grad_norm": 0.0, "kl": 0.14911694824695587, "learning_rate": 2.748691435563208e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8506 }, { "completion_length": 496.5, "epoch": 2.3578159645232817, "grad_norm": 0.0, "kl": 0.2243042141199112, "learning_rate": 2.7482558116221918e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8507 }, { "completion_length": 594.25, "epoch": 2.358093126385809, "grad_norm": 0.0, "kl": 0.1893426477909088, "learning_rate": 2.7478201800682036e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8508 }, { "completion_length": 650.0, "epoch": 2.358370288248337, "grad_norm": 0.0, "kl": 0.22440801560878754, "learning_rate": 2.7473845409146003e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8509 }, { "completion_length": 655.5, "epoch": 2.3586474501108645, "grad_norm": 0.0, "kl": 0.30674219131469727, "learning_rate": 2.7469488941747433e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8510 }, { "completion_length": 596.25, "epoch": 2.3589246119733924, "grad_norm": 0.0, "kl": 0.15824978053569794, "learning_rate": 2.746513239861991e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8511 }, { "completion_length": 520.75, "epoch": 2.3592017738359203, "grad_norm": 0.0, "kl": 0.2810860574245453, "learning_rate": 2.746077577989702e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8512 }, { "completion_length": 609.75, "epoch": 2.359478935698448, "grad_norm": 0.0, "kl": 0.21011105179786682, "learning_rate": 2.745641908571238e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8513 }, { "completion_length": 580.75, "epoch": 2.3597560975609757, "grad_norm": 0.39243191480636597, "kl": 3212709088722944.0, "learning_rate": 2.745206231619959e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8514 }, { "completion_length": 697.5, "epoch": 2.360033259423503, "grad_norm": 0.0, "kl": 0.16257444024085999, "learning_rate": 2.7447705471492243e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8515 }, { "completion_length": 620.5, "epoch": 2.360310421286031, "grad_norm": 0.0, "kl": 0.17803774774074554, "learning_rate": 2.7443348551723945e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8516 }, { "completion_length": 634.5, "epoch": 2.3605875831485585, "grad_norm": 0.0, "kl": 0.19146256148815155, "learning_rate": 2.743899155702833e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8517 }, { "completion_length": 598.25, "epoch": 2.3608647450110865, "grad_norm": 3.455500364303589, "kl": 541742076002304.0, "learning_rate": 2.7434634487538975e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8518 }, { "completion_length": 571.75, "epoch": 2.3611419068736144, "grad_norm": 3.373779535293579, "kl": 63568569434112.0, "learning_rate": 2.743027734338951e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8519 }, { "completion_length": 690.5, "epoch": 2.361419068736142, "grad_norm": 0.0, "kl": 0.16389614343643188, "learning_rate": 2.7425920124713544e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8520 }, { "completion_length": 656.25, "epoch": 2.3616962305986697, "grad_norm": 0.0, "kl": 0.14923949539661407, "learning_rate": 2.742156283164471e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8521 }, { "completion_length": 600.5, "epoch": 2.361973392461197, "grad_norm": 0.0, "kl": 378103520034816.0, "learning_rate": 2.741720546431661e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8522 }, { "completion_length": 642.5, "epoch": 2.362250554323725, "grad_norm": 0.0, "kl": 0.13511823117733002, "learning_rate": 2.7412848022862883e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8523 }, { "completion_length": 566.75, "epoch": 2.3625277161862526, "grad_norm": 0.0, "kl": 0.18448549509048462, "learning_rate": 2.7408490507417136e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8524 }, { "completion_length": 593.5, "epoch": 2.3628048780487805, "grad_norm": 0.0, "kl": 0.18236561119556427, "learning_rate": 2.740413291811301e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8525 }, { "completion_length": 642.25, "epoch": 2.3630820399113084, "grad_norm": 0.0, "kl": 0.17467138171195984, "learning_rate": 2.739977525508412e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8526 }, { "completion_length": 601.25, "epoch": 2.363359201773836, "grad_norm": 0.37292343378067017, "kl": 455701.15625, "learning_rate": 2.7395417518464114e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8527 }, { "completion_length": 654.5, "epoch": 2.3636363636363638, "grad_norm": 0.0, "kl": 0.15872570872306824, "learning_rate": 2.739105970838661e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8528 }, { "completion_length": 561.25, "epoch": 2.363913525498891, "grad_norm": 0.0, "kl": 0.1801506131887436, "learning_rate": 2.7386701824985257e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8529 }, { "completion_length": 586.75, "epoch": 2.364190687361419, "grad_norm": 0.0, "kl": 0.17617535591125488, "learning_rate": 2.7382343868393685e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8530 }, { "completion_length": 559.75, "epoch": 2.3644678492239466, "grad_norm": 0.0, "kl": 0.2172352522611618, "learning_rate": 2.7377985838745542e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8531 }, { "completion_length": 613.5, "epoch": 2.3647450110864745, "grad_norm": 0.0, "kl": 0.18140970170497894, "learning_rate": 2.7373627736174457e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8532 }, { "completion_length": 691.75, "epoch": 2.3650221729490024, "grad_norm": 0.0, "kl": 0.16443955898284912, "learning_rate": 2.7369269560814083e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8533 }, { "completion_length": 608.5, "epoch": 2.36529933481153, "grad_norm": 0.0, "kl": 0.21669630706310272, "learning_rate": 2.736491131279807e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8534 }, { "completion_length": 588.75, "epoch": 2.365576496674058, "grad_norm": 0.0, "kl": 0.18050137162208557, "learning_rate": 2.736055299226007e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8535 }, { "completion_length": 598.5, "epoch": 2.3658536585365852, "grad_norm": 0.0, "kl": 0.19034995138645172, "learning_rate": 2.7356194599333724e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8536 }, { "completion_length": 558.25, "epoch": 2.366130820399113, "grad_norm": 0.0, "kl": 0.19937390089035034, "learning_rate": 2.735183613415269e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8537 }, { "completion_length": 745.25, "epoch": 2.3664079822616406, "grad_norm": 0.0, "kl": 0.17445413768291473, "learning_rate": 2.7347477596850637e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8538 }, { "completion_length": 633.0, "epoch": 2.3666851441241685, "grad_norm": 0.0, "kl": 0.13673913478851318, "learning_rate": 2.73431189875612e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8539 }, { "completion_length": 705.75, "epoch": 2.3669623059866964, "grad_norm": 0.0, "kl": 0.15641555190086365, "learning_rate": 2.7338760306418056e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8540 }, { "completion_length": 707.5, "epoch": 2.367239467849224, "grad_norm": 0.0, "kl": 0.1512550264596939, "learning_rate": 2.7334401553554858e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8541 }, { "completion_length": 529.5, "epoch": 2.367516629711752, "grad_norm": 0.0, "kl": 70308.515625, "learning_rate": 2.7330042729105276e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8542 }, { "completion_length": 636.0, "epoch": 2.3677937915742793, "grad_norm": 0.0, "kl": 0.16203026473522186, "learning_rate": 2.7325683833202975e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8543 }, { "completion_length": 577.0, "epoch": 2.368070953436807, "grad_norm": 0.0, "kl": 0.1473502665758133, "learning_rate": 2.732132486598163e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8544 }, { "completion_length": 649.0, "epoch": 2.3683481152993346, "grad_norm": 0.0, "kl": 0.19982044398784637, "learning_rate": 2.7316965827574903e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8545 }, { "completion_length": 550.0, "epoch": 2.3686252771618626, "grad_norm": 0.0, "kl": 0.1865672469139099, "learning_rate": 2.731260671811648e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8546 }, { "completion_length": 661.0, "epoch": 2.3689024390243905, "grad_norm": 0.0, "kl": 0.1420600712299347, "learning_rate": 2.7308247537740023e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8547 }, { "completion_length": 637.5, "epoch": 2.369179600886918, "grad_norm": 0.0, "kl": 0.171818345785141, "learning_rate": 2.7303888286579217e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8548 }, { "completion_length": 702.0, "epoch": 2.369456762749446, "grad_norm": 0.0, "kl": 0.20007407665252686, "learning_rate": 2.7299528964767736e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8549 }, { "completion_length": 727.25, "epoch": 2.3697339246119733, "grad_norm": 0.0, "kl": 0.1976885050535202, "learning_rate": 2.729516957243928e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8550 }, { "completion_length": 573.75, "epoch": 2.370011086474501, "grad_norm": 0.0, "kl": 0.20111189782619476, "learning_rate": 2.729081010972751e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8551 }, { "completion_length": 636.75, "epoch": 2.3702882483370287, "grad_norm": 0.0, "kl": 0.1367342621088028, "learning_rate": 2.7286450576766123e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8552 }, { "completion_length": 753.0, "epoch": 2.3705654101995566, "grad_norm": 0.0, "kl": 0.14227844774723053, "learning_rate": 2.728209097368882e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8553 }, { "completion_length": 677.5, "epoch": 2.3708425720620845, "grad_norm": 0.0, "kl": 0.16041336953639984, "learning_rate": 2.7277731300629266e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8554 }, { "completion_length": 655.0, "epoch": 2.371119733924612, "grad_norm": 0.0, "kl": 0.17099711298942566, "learning_rate": 2.727337155772118e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8555 }, { "completion_length": 595.25, "epoch": 2.37139689578714, "grad_norm": 0.0, "kl": 0.1670798659324646, "learning_rate": 2.726901174509824e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8556 }, { "completion_length": 515.5, "epoch": 2.3716740576496673, "grad_norm": 0.0, "kl": 0.23512180149555206, "learning_rate": 2.7264651862894144e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8557 }, { "completion_length": 648.5, "epoch": 2.3719512195121952, "grad_norm": 0.0, "kl": 0.1962755024433136, "learning_rate": 2.7260291911242597e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8558 }, { "completion_length": 609.5, "epoch": 2.3722283813747227, "grad_norm": 0.0, "kl": 0.22084343433380127, "learning_rate": 2.725593189027731e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8559 }, { "completion_length": 607.25, "epoch": 2.3725055432372506, "grad_norm": 0.0, "kl": 0.14276789128780365, "learning_rate": 2.7251571800131966e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8560 }, { "completion_length": 636.5, "epoch": 2.3727827050997785, "grad_norm": 0.0, "kl": 0.14705151319503784, "learning_rate": 2.724721164094029e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8561 }, { "completion_length": 508.25, "epoch": 2.373059866962306, "grad_norm": 1.5254132747650146, "kl": 15171208609792.0, "learning_rate": 2.7242851412835973e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8562 }, { "completion_length": 543.25, "epoch": 2.373337028824834, "grad_norm": 0.0, "kl": 0.19262804090976715, "learning_rate": 2.7238491115952736e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8563 }, { "completion_length": 627.5, "epoch": 2.3736141906873613, "grad_norm": 0.0, "kl": 0.7853958606719971, "learning_rate": 2.723413075042429e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8564 }, { "completion_length": 578.0, "epoch": 2.3738913525498893, "grad_norm": 0.0, "kl": 0.20516341924667358, "learning_rate": 2.722977031638435e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8565 }, { "completion_length": 632.0, "epoch": 2.3741685144124167, "grad_norm": 0.0, "kl": 0.23793084919452667, "learning_rate": 2.722540981396662e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8566 }, { "completion_length": 612.0, "epoch": 2.3744456762749446, "grad_norm": 0.0, "kl": 0.1566510647535324, "learning_rate": 2.7221049243304833e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8567 }, { "completion_length": 608.5, "epoch": 2.374722838137472, "grad_norm": 0.0, "kl": 94085677056.0, "learning_rate": 2.721668860453271e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8568 }, { "completion_length": 698.0, "epoch": 2.375, "grad_norm": 0.0, "kl": 0.192289799451828, "learning_rate": 2.7212327897783963e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8569 }, { "completion_length": 592.5, "epoch": 2.375277161862528, "grad_norm": 0.0, "kl": 0.1668900102376938, "learning_rate": 2.720796712319233e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8570 }, { "completion_length": 662.5, "epoch": 2.3755543237250554, "grad_norm": 0.0, "kl": 0.2044084668159485, "learning_rate": 2.720360628089152e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8571 }, { "completion_length": 591.5, "epoch": 2.3758314855875833, "grad_norm": 0.0, "kl": 0.1684889942407608, "learning_rate": 2.7199245371015274e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8572 }, { "completion_length": 584.25, "epoch": 2.3761086474501107, "grad_norm": 0.0, "kl": 0.23329418897628784, "learning_rate": 2.7194884393697325e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8573 }, { "completion_length": 633.25, "epoch": 2.3763858093126387, "grad_norm": 0.0, "kl": 0.1590922474861145, "learning_rate": 2.7190523349071406e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8574 }, { "completion_length": 580.5, "epoch": 2.376662971175166, "grad_norm": 0.0, "kl": 0.17573866248130798, "learning_rate": 2.718616223727124e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8575 }, { "completion_length": 676.25, "epoch": 2.376940133037694, "grad_norm": 0.0, "kl": 0.3416716754436493, "learning_rate": 2.7181801058430585e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8576 }, { "completion_length": 612.5, "epoch": 2.3772172949002215, "grad_norm": 0.0, "kl": 0.17716586589813232, "learning_rate": 2.7177439812683155e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8577 }, { "completion_length": 634.5, "epoch": 2.3774944567627494, "grad_norm": 0.0, "kl": 0.2098720371723175, "learning_rate": 2.7173078500162708e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8578 }, { "completion_length": 790.5, "epoch": 2.3777716186252773, "grad_norm": 1.3296200037002563, "kl": 7270123962368.0, "learning_rate": 2.716871712100298e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8579 }, { "completion_length": 507.25, "epoch": 2.3780487804878048, "grad_norm": 0.0, "kl": 0.2014172524213791, "learning_rate": 2.716435567533772e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8580 }, { "completion_length": 597.75, "epoch": 2.3783259423503327, "grad_norm": 1.4118229150772095, "kl": 95573.2890625, "learning_rate": 2.715999416330068e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8581 }, { "completion_length": 666.5, "epoch": 2.37860310421286, "grad_norm": 0.0, "kl": 258863903277056.0, "learning_rate": 2.7155632585025605e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8582 }, { "completion_length": 577.25, "epoch": 2.378880266075388, "grad_norm": 0.0, "kl": 40077.12109375, "learning_rate": 2.7151270940646243e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8583 }, { "completion_length": 580.75, "epoch": 2.3791574279379155, "grad_norm": 0.0, "kl": 0.1883448362350464, "learning_rate": 2.714690923029634e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8584 }, { "completion_length": 694.25, "epoch": 2.3794345898004434, "grad_norm": 0.0, "kl": 0.178286612033844, "learning_rate": 2.714254745410968e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8585 }, { "completion_length": 615.5, "epoch": 2.3797117516629713, "grad_norm": 0.0, "kl": 0.14497065544128418, "learning_rate": 2.7138185612219993e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8586 }, { "completion_length": 671.5, "epoch": 2.379988913525499, "grad_norm": 0.0, "kl": 0.16663680970668793, "learning_rate": 2.713382370476105e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8587 }, { "completion_length": 579.75, "epoch": 2.3802660753880267, "grad_norm": 0.0, "kl": 0.18120668828487396, "learning_rate": 2.7129461731866602e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8588 }, { "completion_length": 564.25, "epoch": 2.380543237250554, "grad_norm": 0.0, "kl": 0.20887671411037445, "learning_rate": 2.712509969367043e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8589 }, { "completion_length": 626.0, "epoch": 2.380820399113082, "grad_norm": 0.0, "kl": 0.16620029509067535, "learning_rate": 2.712073759030629e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8590 }, { "completion_length": 609.5, "epoch": 2.3810975609756095, "grad_norm": 0.0, "kl": 0.4595721960067749, "learning_rate": 2.7116375421907955e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8591 }, { "completion_length": 588.5, "epoch": 2.3813747228381374, "grad_norm": 0.3867006003856659, "kl": 127649833811968.0, "learning_rate": 2.7112013188609182e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8592 }, { "completion_length": 575.25, "epoch": 2.3816518847006654, "grad_norm": 0.0, "kl": 0.20072627067565918, "learning_rate": 2.710765089054375e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8593 }, { "completion_length": 553.5, "epoch": 2.381929046563193, "grad_norm": 0.0, "kl": 0.20165331661701202, "learning_rate": 2.710328852784543e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8594 }, { "completion_length": 649.75, "epoch": 2.3822062084257207, "grad_norm": 0.0, "kl": 0.1857084035873413, "learning_rate": 2.709892610064801e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8595 }, { "completion_length": 572.75, "epoch": 2.382483370288248, "grad_norm": 0.0, "kl": 0.1981993466615677, "learning_rate": 2.709456360908525e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8596 }, { "completion_length": 790.0, "epoch": 2.382760532150776, "grad_norm": 0.0, "kl": 0.17582204937934875, "learning_rate": 2.709020105329094e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8597 }, { "completion_length": 649.75, "epoch": 2.3830376940133036, "grad_norm": 0.0, "kl": 0.1576891839504242, "learning_rate": 2.7085838433398865e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8598 }, { "completion_length": 509.5, "epoch": 2.3833148558758315, "grad_norm": 0.0, "kl": 0.20124578475952148, "learning_rate": 2.708147574954279e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8599 }, { "completion_length": 598.5, "epoch": 2.3835920177383594, "grad_norm": 0.0, "kl": 0.17397044599056244, "learning_rate": 2.7077113001856525e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8600 }, { "completion_length": 692.5, "epoch": 2.383869179600887, "grad_norm": 0.0, "kl": 0.19617801904678345, "learning_rate": 2.7072750190473835e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8601 }, { "completion_length": 597.75, "epoch": 2.3841463414634148, "grad_norm": 0.0, "kl": 0.22958159446716309, "learning_rate": 2.706838731552852e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8602 }, { "completion_length": 806.5, "epoch": 2.384423503325942, "grad_norm": 0.0, "kl": 0.9928058981895447, "learning_rate": 2.7064024377154374e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8603 }, { "completion_length": 830.0, "epoch": 2.38470066518847, "grad_norm": 0.0, "kl": 0.13908067345619202, "learning_rate": 2.7059661375485185e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8604 }, { "completion_length": 532.0, "epoch": 2.3849778270509976, "grad_norm": 0.0, "kl": 0.19504313170909882, "learning_rate": 2.705529831065475e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8605 }, { "completion_length": 562.5, "epoch": 2.3852549889135255, "grad_norm": 0.0, "kl": 0.1473795771598816, "learning_rate": 2.7050935182796866e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8606 }, { "completion_length": 563.0, "epoch": 2.3855321507760534, "grad_norm": 0.0, "kl": 0.18602463603019714, "learning_rate": 2.7046571992045334e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8607 }, { "completion_length": 654.25, "epoch": 2.385809312638581, "grad_norm": 0.0, "kl": 0.2621820569038391, "learning_rate": 2.7042208738533947e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8608 }, { "completion_length": 615.25, "epoch": 2.386086474501109, "grad_norm": 0.0, "kl": 0.17544329166412354, "learning_rate": 2.7037845422396513e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8609 }, { "completion_length": 644.5, "epoch": 2.3863636363636362, "grad_norm": 0.0, "kl": 0.1703096628189087, "learning_rate": 2.7033482043766844e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8610 }, { "completion_length": 630.75, "epoch": 2.386640798226164, "grad_norm": 0.0, "kl": 0.20983150601387024, "learning_rate": 2.702911860277873e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8611 }, { "completion_length": 610.5, "epoch": 2.3869179600886916, "grad_norm": 0.0, "kl": 0.16064268350601196, "learning_rate": 2.7024755099566004e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8612 }, { "completion_length": 614.75, "epoch": 2.3871951219512195, "grad_norm": 0.0, "kl": 0.15704794228076935, "learning_rate": 2.7020391534262445e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8613 }, { "completion_length": 634.75, "epoch": 2.3874722838137474, "grad_norm": 0.48554182052612305, "kl": 1139618971910144.0, "learning_rate": 2.701602790700189e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8614 }, { "completion_length": 572.75, "epoch": 2.387749445676275, "grad_norm": 0.0, "kl": 0.2238110899925232, "learning_rate": 2.7011664217918154e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8615 }, { "completion_length": 628.75, "epoch": 2.388026607538803, "grad_norm": 0.0, "kl": 0.19179391860961914, "learning_rate": 2.700730046714503e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8616 }, { "completion_length": 522.0, "epoch": 2.3883037694013303, "grad_norm": 0.0, "kl": 4.229987144470215, "learning_rate": 2.7002936654816356e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8617 }, { "completion_length": 665.75, "epoch": 2.388580931263858, "grad_norm": 0.0, "kl": 0.1479351669549942, "learning_rate": 2.6998572781065953e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8618 }, { "completion_length": 645.5, "epoch": 2.3888580931263856, "grad_norm": 0.0, "kl": 0.13853248953819275, "learning_rate": 2.699420884602763e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8619 }, { "completion_length": 740.75, "epoch": 2.3891352549889135, "grad_norm": 0.0, "kl": 0.17968019843101501, "learning_rate": 2.698984484983522e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8620 }, { "completion_length": 707.0, "epoch": 2.3894124168514415, "grad_norm": 0.0, "kl": 0.21328909695148468, "learning_rate": 2.6985480792622546e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8621 }, { "completion_length": 649.75, "epoch": 2.389689578713969, "grad_norm": 0.0, "kl": 0.1810053288936615, "learning_rate": 2.6981116674523433e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8622 }, { "completion_length": 561.75, "epoch": 2.389966740576497, "grad_norm": 0.0, "kl": 0.21581223607063293, "learning_rate": 2.697675249567172e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8623 }, { "completion_length": 592.5, "epoch": 2.3902439024390243, "grad_norm": 0.0, "kl": 0.17991824448108673, "learning_rate": 2.697238825620122e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8624 }, { "completion_length": 568.75, "epoch": 2.390521064301552, "grad_norm": 0.0, "kl": 0.18654002249240875, "learning_rate": 2.696802395624579e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8625 }, { "completion_length": 616.5, "epoch": 2.3907982261640797, "grad_norm": 0.0, "kl": 0.18531425297260284, "learning_rate": 2.696365959593924e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8626 }, { "completion_length": 625.75, "epoch": 2.3910753880266076, "grad_norm": 0.0, "kl": 0.19192227721214294, "learning_rate": 2.695929517541543e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8627 }, { "completion_length": 557.0, "epoch": 2.3913525498891355, "grad_norm": 0.0, "kl": 2.6759579181671143, "learning_rate": 2.695493069480818e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8628 }, { "completion_length": 580.25, "epoch": 2.391629711751663, "grad_norm": 0.0, "kl": 0.2712588906288147, "learning_rate": 2.695056615425134e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8629 }, { "completion_length": 710.25, "epoch": 2.391906873614191, "grad_norm": 0.0, "kl": 0.18926431238651276, "learning_rate": 2.694620155387875e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8630 }, { "completion_length": 535.5, "epoch": 2.3921840354767183, "grad_norm": 0.0, "kl": 0.1920231729745865, "learning_rate": 2.6941836893824248e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8631 }, { "completion_length": 620.5, "epoch": 2.3924611973392462, "grad_norm": 0.41171973943710327, "kl": 532374752329728.0, "learning_rate": 2.693747217422169e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8632 }, { "completion_length": 650.25, "epoch": 2.3927383592017737, "grad_norm": 0.0, "kl": 0.25606563687324524, "learning_rate": 2.6933107395204926e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8633 }, { "completion_length": 493.5, "epoch": 2.3930155210643016, "grad_norm": 0.0, "kl": 1.0163042545318604, "learning_rate": 2.6928742556907795e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8634 }, { "completion_length": 584.25, "epoch": 2.3932926829268295, "grad_norm": 0.0, "kl": 0.3459542989730835, "learning_rate": 2.6924377659464152e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8635 }, { "completion_length": 567.25, "epoch": 2.393569844789357, "grad_norm": 0.0, "kl": 11870.490234375, "learning_rate": 2.692001270300786e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8636 }, { "completion_length": 611.5, "epoch": 2.393847006651885, "grad_norm": 0.0, "kl": 0.18299126625061035, "learning_rate": 2.6915647687672754e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8637 }, { "completion_length": 696.25, "epoch": 2.3941241685144123, "grad_norm": 0.33084267377853394, "kl": 91657554886656.0, "learning_rate": 2.691128261359271e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8638 }, { "completion_length": 732.0, "epoch": 2.3944013303769403, "grad_norm": 0.0, "kl": 0.15928179025650024, "learning_rate": 2.690691748090157e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8639 }, { "completion_length": 604.5, "epoch": 2.3946784922394677, "grad_norm": 0.0, "kl": 0.1720491349697113, "learning_rate": 2.6902552289733214e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8640 }, { "completion_length": 649.5, "epoch": 2.3949556541019956, "grad_norm": 0.0, "kl": 0.16810722649097443, "learning_rate": 2.6898187040221484e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8641 }, { "completion_length": 632.75, "epoch": 2.395232815964523, "grad_norm": 0.0, "kl": 0.27929407358169556, "learning_rate": 2.6893821732500264e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8642 }, { "completion_length": 582.5, "epoch": 2.395509977827051, "grad_norm": 0.0, "kl": 0.1706072986125946, "learning_rate": 2.6889456366703403e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8643 }, { "completion_length": 558.5, "epoch": 2.395787139689579, "grad_norm": 0.0, "kl": 12252458516480.0, "learning_rate": 2.688509094296478e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8644 }, { "completion_length": 654.0, "epoch": 2.3960643015521064, "grad_norm": 0.0, "kl": 0.584369421005249, "learning_rate": 2.6880725461418254e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8645 }, { "completion_length": 668.0, "epoch": 2.3963414634146343, "grad_norm": 0.0, "kl": 0.3541910648345947, "learning_rate": 2.6876359922197703e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8646 }, { "completion_length": 640.75, "epoch": 2.3966186252771617, "grad_norm": 0.0, "kl": 0.2016674429178238, "learning_rate": 2.6871994325436994e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8647 }, { "completion_length": 614.5, "epoch": 2.3968957871396896, "grad_norm": 0.0, "kl": 0.17343272268772125, "learning_rate": 2.686762867127002e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8648 }, { "completion_length": 611.75, "epoch": 2.397172949002217, "grad_norm": 0.0, "kl": 0.1707153171300888, "learning_rate": 2.6863262959830633e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8649 }, { "completion_length": 613.25, "epoch": 2.397450110864745, "grad_norm": 0.0, "kl": 0.1721877008676529, "learning_rate": 2.685889719125272e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8650 }, { "completion_length": 546.0, "epoch": 2.3977272727272725, "grad_norm": 0.0, "kl": 0.23318137228488922, "learning_rate": 2.685453136567017e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8651 }, { "completion_length": 572.75, "epoch": 2.3980044345898004, "grad_norm": 0.0, "kl": 108148.921875, "learning_rate": 2.6850165483216858e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8652 }, { "completion_length": 582.0, "epoch": 2.3982815964523283, "grad_norm": 0.0, "kl": 0.2605983018875122, "learning_rate": 2.6845799544026668e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8653 }, { "completion_length": 608.5, "epoch": 2.3985587583148558, "grad_norm": 0.0, "kl": 0.15554320812225342, "learning_rate": 2.684143354823347e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8654 }, { "completion_length": 590.0, "epoch": 2.3988359201773837, "grad_norm": 0.0, "kl": 0.2880902588367462, "learning_rate": 2.683706749597118e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8655 }, { "completion_length": 531.0, "epoch": 2.399113082039911, "grad_norm": 0.0, "kl": 0.22434408962726593, "learning_rate": 2.683270138737367e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8656 }, { "completion_length": 562.5, "epoch": 2.399390243902439, "grad_norm": 0.0, "kl": 0.1914694458246231, "learning_rate": 2.682833522257483e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8657 }, { "completion_length": 471.0, "epoch": 2.3996674057649665, "grad_norm": 0.0, "kl": 0.16133223474025726, "learning_rate": 2.6823969001708556e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8658 }, { "completion_length": 695.25, "epoch": 2.3999445676274944, "grad_norm": 0.3698557913303375, "kl": 3696179019776.0, "learning_rate": 2.6819602724908744e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8659 }, { "completion_length": 583.75, "epoch": 2.4002217294900223, "grad_norm": 0.0, "kl": 0.18678885698318481, "learning_rate": 2.6815236392309274e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8660 }, { "completion_length": 500.25, "epoch": 2.40049889135255, "grad_norm": 0.0, "kl": 0.22982874512672424, "learning_rate": 2.6810870004044065e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8661 }, { "completion_length": 712.5, "epoch": 2.4007760532150777, "grad_norm": 0.29631271958351135, "kl": 0.15042121708393097, "learning_rate": 2.6806503560247e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8662 }, { "completion_length": 567.25, "epoch": 2.401053215077605, "grad_norm": 0.0, "kl": 0.16872750222682953, "learning_rate": 2.680213706105199e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8663 }, { "completion_length": 564.0, "epoch": 2.401330376940133, "grad_norm": 0.0, "kl": 1.077375888824463, "learning_rate": 2.679777050659293e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8664 }, { "completion_length": 627.25, "epoch": 2.4016075388026605, "grad_norm": 0.0, "kl": 0.3753209710121155, "learning_rate": 2.6793403897003728e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8665 }, { "completion_length": 634.5, "epoch": 2.4018847006651884, "grad_norm": 0.0, "kl": 0.16639478504657745, "learning_rate": 2.678903723241829e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8666 }, { "completion_length": 600.25, "epoch": 2.4021618625277164, "grad_norm": 0.0, "kl": 0.15818415582180023, "learning_rate": 2.678467051297052e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8667 }, { "completion_length": 602.75, "epoch": 2.402439024390244, "grad_norm": 0.0, "kl": 0.17421427369117737, "learning_rate": 2.678030373879434e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8668 }, { "completion_length": 655.5, "epoch": 2.4027161862527717, "grad_norm": 0.0, "kl": 0.3099769055843353, "learning_rate": 2.6775936910023633e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8669 }, { "completion_length": 612.0, "epoch": 2.402993348115299, "grad_norm": 0.0, "kl": 0.1873106211423874, "learning_rate": 2.6771570026792337e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8670 }, { "completion_length": 619.25, "epoch": 2.403270509977827, "grad_norm": 0.0, "kl": 0.18533504009246826, "learning_rate": 2.6767203089234354e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8671 }, { "completion_length": 704.5, "epoch": 2.4035476718403546, "grad_norm": 0.0, "kl": 0.25781282782554626, "learning_rate": 2.6762836097483615e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8672 }, { "completion_length": 706.25, "epoch": 2.4038248337028825, "grad_norm": 0.0, "kl": 0.15978728234767914, "learning_rate": 2.6758469051674017e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8673 }, { "completion_length": 665.0, "epoch": 2.4041019955654104, "grad_norm": 0.0, "kl": 0.1710873395204544, "learning_rate": 2.6754101951939494e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8674 }, { "completion_length": 618.75, "epoch": 2.404379157427938, "grad_norm": 0.0, "kl": 0.1986246407032013, "learning_rate": 2.674973479841395e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8675 }, { "completion_length": 669.25, "epoch": 2.4046563192904657, "grad_norm": 0.0, "kl": 0.2073696404695511, "learning_rate": 2.6745367591231323e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8676 }, { "completion_length": 601.5, "epoch": 2.404933481152993, "grad_norm": 0.0, "kl": 0.17546433210372925, "learning_rate": 2.6741000330525537e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8677 }, { "completion_length": 595.5, "epoch": 2.405210643015521, "grad_norm": 0.0, "kl": 0.32593458890914917, "learning_rate": 2.6736633016430514e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8678 }, { "completion_length": 618.0, "epoch": 2.4054878048780486, "grad_norm": 0.0, "kl": 0.3953627943992615, "learning_rate": 2.6732265649080175e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8679 }, { "completion_length": 641.5, "epoch": 2.4057649667405765, "grad_norm": 0.0, "kl": 0.22186768054962158, "learning_rate": 2.672789822860846e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8680 }, { "completion_length": 680.75, "epoch": 2.4060421286031044, "grad_norm": 0.0, "kl": 0.15892241895198822, "learning_rate": 2.672353075514929e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8681 }, { "completion_length": 642.0, "epoch": 2.406319290465632, "grad_norm": 0.0, "kl": 0.24735388159751892, "learning_rate": 2.6719163228836603e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8682 }, { "completion_length": 618.5, "epoch": 2.4065964523281598, "grad_norm": 0.0, "kl": 0.19826337695121765, "learning_rate": 2.6714795649804336e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8683 }, { "completion_length": 633.25, "epoch": 2.4068736141906872, "grad_norm": 0.0, "kl": 0.1934579312801361, "learning_rate": 2.671042801818641e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8684 }, { "completion_length": 587.25, "epoch": 2.407150776053215, "grad_norm": 0.0, "kl": 2564445440.0, "learning_rate": 2.670606033411678e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8685 }, { "completion_length": 629.25, "epoch": 2.4074279379157426, "grad_norm": 0.0, "kl": 0.1803608387708664, "learning_rate": 2.670169259772937e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8686 }, { "completion_length": 592.0, "epoch": 2.4077050997782705, "grad_norm": 0.0, "kl": 0.20406118035316467, "learning_rate": 2.669732480915813e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8687 }, { "completion_length": 542.75, "epoch": 2.4079822616407984, "grad_norm": 0.0, "kl": 0.15442046523094177, "learning_rate": 2.6692956968537e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8688 }, { "completion_length": 594.75, "epoch": 2.408259423503326, "grad_norm": 0.0, "kl": 0.18489590287208557, "learning_rate": 2.6688589075999925e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8689 }, { "completion_length": 577.75, "epoch": 2.408536585365854, "grad_norm": 0.0, "kl": 355142696960.0, "learning_rate": 2.668422113168084e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8690 }, { "completion_length": 596.75, "epoch": 2.4088137472283813, "grad_norm": 3.470867395401001, "kl": 31595308.0, "learning_rate": 2.66798531357137e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8691 }, { "completion_length": 595.0, "epoch": 2.409090909090909, "grad_norm": 0.0, "kl": 0.18992821872234344, "learning_rate": 2.6675485088232454e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8692 }, { "completion_length": 594.25, "epoch": 2.4093680709534366, "grad_norm": 0.38290950655937195, "kl": 873317120.0, "learning_rate": 2.667111698937106e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8693 }, { "completion_length": 654.25, "epoch": 2.4096452328159645, "grad_norm": 0.0, "kl": 0.1740352362394333, "learning_rate": 2.6666748839263447e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8694 }, { "completion_length": 698.0, "epoch": 2.4099223946784925, "grad_norm": 0.0, "kl": 0.18319067358970642, "learning_rate": 2.666238063804359e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8695 }, { "completion_length": 554.0, "epoch": 2.41019955654102, "grad_norm": 0.0, "kl": 0.2314160168170929, "learning_rate": 2.665801238584543e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8696 }, { "completion_length": 731.5, "epoch": 2.410476718403548, "grad_norm": 1.312657356262207, "kl": 9992343552.0, "learning_rate": 2.665364408280292e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8697 }, { "completion_length": 668.0, "epoch": 2.4107538802660753, "grad_norm": 0.0, "kl": 0.17174965143203735, "learning_rate": 2.664927572905003e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8698 }, { "completion_length": 651.25, "epoch": 2.411031042128603, "grad_norm": 0.0, "kl": 0.17183730006217957, "learning_rate": 2.664490732472072e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8699 }, { "completion_length": 626.75, "epoch": 2.4113082039911307, "grad_norm": 0.0, "kl": 0.14434649050235748, "learning_rate": 2.664053886994894e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8700 }, { "completion_length": 546.0, "epoch": 2.4115853658536586, "grad_norm": 0.0, "kl": 0.21489432454109192, "learning_rate": 2.6636170364868654e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8701 }, { "completion_length": 565.0, "epoch": 2.4118625277161865, "grad_norm": 0.0, "kl": 0.1678495854139328, "learning_rate": 2.663180180961383e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8702 }, { "completion_length": 679.0, "epoch": 2.412139689578714, "grad_norm": 0.0, "kl": 0.15071575343608856, "learning_rate": 2.6627433204318436e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8703 }, { "completion_length": 606.75, "epoch": 2.412416851441242, "grad_norm": 0.0, "kl": 0.16478195786476135, "learning_rate": 2.662306454911644e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8704 }, { "completion_length": 578.5, "epoch": 2.4126940133037693, "grad_norm": 0.6042741537094116, "kl": 231813480448.0, "learning_rate": 2.66186958441418e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8705 }, { "completion_length": 540.25, "epoch": 2.412971175166297, "grad_norm": 0.0, "kl": 0.19937309622764587, "learning_rate": 2.661432708952849e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8706 }, { "completion_length": 626.0, "epoch": 2.4132483370288247, "grad_norm": 0.0, "kl": 1636157489152.0, "learning_rate": 2.6609958285410488e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8707 }, { "completion_length": 488.25, "epoch": 2.4135254988913526, "grad_norm": 0.0, "kl": 0.26478850841522217, "learning_rate": 2.6605589431921763e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8708 }, { "completion_length": 734.0, "epoch": 2.4138026607538805, "grad_norm": 0.5601027011871338, "kl": 0.18471187353134155, "learning_rate": 2.660122052919629e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8709 }, { "completion_length": 720.75, "epoch": 2.414079822616408, "grad_norm": 0.0, "kl": 0.15890052914619446, "learning_rate": 2.6596851577368043e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8710 }, { "completion_length": 716.0, "epoch": 2.414356984478936, "grad_norm": 0.0, "kl": 0.13179360330104828, "learning_rate": 2.6592482576571e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8711 }, { "completion_length": 501.5, "epoch": 2.4146341463414633, "grad_norm": 0.7129942178726196, "kl": 2315.1171875, "learning_rate": 2.6588113526939135e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8712 }, { "completion_length": 623.75, "epoch": 2.4149113082039912, "grad_norm": 0.0, "kl": 0.1557929813861847, "learning_rate": 2.6583744428606444e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8713 }, { "completion_length": 516.25, "epoch": 2.4151884700665187, "grad_norm": 0.0, "kl": 0.2317117154598236, "learning_rate": 2.65793752817069e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8714 }, { "completion_length": 555.25, "epoch": 2.4154656319290466, "grad_norm": 0.47946736216545105, "kl": 128361.90625, "learning_rate": 2.657500608637448e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8715 }, { "completion_length": 530.5, "epoch": 2.4157427937915745, "grad_norm": 0.0, "kl": 1433674.625, "learning_rate": 2.657063684274317e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8716 }, { "completion_length": 695.25, "epoch": 2.416019955654102, "grad_norm": 0.0, "kl": 0.17004512250423431, "learning_rate": 2.6566267550946973e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8717 }, { "completion_length": 597.75, "epoch": 2.41629711751663, "grad_norm": 0.0, "kl": 0.14887675642967224, "learning_rate": 2.656189821111986e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8718 }, { "completion_length": 622.0, "epoch": 2.4165742793791574, "grad_norm": 0.0, "kl": 0.14349226653575897, "learning_rate": 2.6557528823395836e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8719 }, { "completion_length": 634.25, "epoch": 2.4168514412416853, "grad_norm": 0.0, "kl": 0.16291770339012146, "learning_rate": 2.6553159387908868e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8720 }, { "completion_length": 578.25, "epoch": 2.4171286031042127, "grad_norm": 0.0, "kl": 0.18790613114833832, "learning_rate": 2.6548789904792966e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8721 }, { "completion_length": 656.25, "epoch": 2.4174057649667406, "grad_norm": 0.6084274649620056, "kl": 31253739520.0, "learning_rate": 2.6544420374182122e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8722 }, { "completion_length": 625.25, "epoch": 2.417682926829268, "grad_norm": 0.0, "kl": 0.14702297747135162, "learning_rate": 2.6540050796210334e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8723 }, { "completion_length": 585.25, "epoch": 2.417960088691796, "grad_norm": 0.0, "kl": 0.16000647842884064, "learning_rate": 2.653568117101159e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8724 }, { "completion_length": 623.5, "epoch": 2.4182372505543235, "grad_norm": 0.0, "kl": 0.16278015077114105, "learning_rate": 2.65313114987199e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8725 }, { "completion_length": 652.25, "epoch": 2.4185144124168514, "grad_norm": 0.0, "kl": 0.1537848711013794, "learning_rate": 2.6526941779469244e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8726 }, { "completion_length": 514.5, "epoch": 2.4187915742793793, "grad_norm": 0.0, "kl": 0.28677085041999817, "learning_rate": 2.652257201339365e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8727 }, { "completion_length": 611.25, "epoch": 2.4190687361419068, "grad_norm": 0.0, "kl": 0.2288229763507843, "learning_rate": 2.6518202200627086e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8728 }, { "completion_length": 598.0, "epoch": 2.4193458980044347, "grad_norm": 0.0, "kl": 0.1863863319158554, "learning_rate": 2.6513832341303593e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8729 }, { "completion_length": 653.75, "epoch": 2.419623059866962, "grad_norm": 0.0, "kl": 0.18106673657894135, "learning_rate": 2.6509462435557155e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8730 }, { "completion_length": 711.25, "epoch": 2.41990022172949, "grad_norm": 0.0, "kl": 0.1283816397190094, "learning_rate": 2.6505092483521776e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8731 }, { "completion_length": 693.75, "epoch": 2.4201773835920175, "grad_norm": 0.0, "kl": 0.15947012603282928, "learning_rate": 2.6500722485331488e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8732 }, { "completion_length": 677.0, "epoch": 2.4204545454545454, "grad_norm": 0.0, "kl": 0.14757739007472992, "learning_rate": 2.6496352441120273e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8733 }, { "completion_length": 580.25, "epoch": 2.4207317073170733, "grad_norm": 0.0, "kl": 0.18842220306396484, "learning_rate": 2.6491982351022153e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8734 }, { "completion_length": 763.75, "epoch": 2.421008869179601, "grad_norm": 0.2811007499694824, "kl": 0.12795273959636688, "learning_rate": 2.6487612215171142e-06, "loss": 0.0, "reward": 5.71875, "reward_std": 0.0625, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.71875, "step": 8735 }, { "completion_length": 616.0, "epoch": 2.4212860310421287, "grad_norm": 0.0, "kl": 0.16497254371643066, "learning_rate": 2.648324203370126e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8736 }, { "completion_length": 632.0, "epoch": 2.421563192904656, "grad_norm": 0.0, "kl": 0.22752901911735535, "learning_rate": 2.6478871806746496e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8737 }, { "completion_length": 669.5, "epoch": 2.421840354767184, "grad_norm": 0.0, "kl": 0.1511225551366806, "learning_rate": 2.6474501534440905e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8738 }, { "completion_length": 554.5, "epoch": 2.4221175166297115, "grad_norm": 0.0, "kl": 0.16636668145656586, "learning_rate": 2.6470131216918475e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8739 }, { "completion_length": 627.75, "epoch": 2.4223946784922394, "grad_norm": 0.0, "kl": 0.1463824361562729, "learning_rate": 2.646576085431325e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8740 }, { "completion_length": 634.0, "epoch": 2.4226718403547673, "grad_norm": 0.0, "kl": 0.18706026673316956, "learning_rate": 2.6461390446759223e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8741 }, { "completion_length": 590.75, "epoch": 2.422949002217295, "grad_norm": 0.0, "kl": 0.16821858286857605, "learning_rate": 2.6457019994390435e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8742 }, { "completion_length": 595.5, "epoch": 2.4232261640798227, "grad_norm": 0.0, "kl": 0.17480744421482086, "learning_rate": 2.6452649497340897e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8743 }, { "completion_length": 688.75, "epoch": 2.42350332594235, "grad_norm": 0.0, "kl": 4163603456.0, "learning_rate": 2.6448278955744655e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8744 }, { "completion_length": 698.5, "epoch": 2.423780487804878, "grad_norm": 0.0, "kl": 0.1874070167541504, "learning_rate": 2.6443908369735715e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8745 }, { "completion_length": 644.75, "epoch": 2.4240576496674056, "grad_norm": 0.33571556210517883, "kl": 0.13487504422664642, "learning_rate": 2.6439537739448114e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8746 }, { "completion_length": 673.75, "epoch": 2.4243348115299335, "grad_norm": 0.0, "kl": 0.17562606930732727, "learning_rate": 2.6435167065015875e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8747 }, { "completion_length": 660.0, "epoch": 2.4246119733924614, "grad_norm": 0.0, "kl": 0.38029903173446655, "learning_rate": 2.6430796346573033e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8748 }, { "completion_length": 604.75, "epoch": 2.424889135254989, "grad_norm": 0.0, "kl": 0.1573161780834198, "learning_rate": 2.6426425584253625e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8749 }, { "completion_length": 552.0, "epoch": 2.4251662971175167, "grad_norm": 0.0, "kl": 0.19506272673606873, "learning_rate": 2.6422054778191674e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8750 }, { "completion_length": 668.0, "epoch": 2.425443458980044, "grad_norm": 0.0, "kl": 5706.47802734375, "learning_rate": 2.6417683928521215e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8751 }, { "completion_length": 601.5, "epoch": 2.425720620842572, "grad_norm": 0.0, "kl": 376851.375, "learning_rate": 2.6413313035376294e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8752 }, { "completion_length": 525.0, "epoch": 2.4259977827050996, "grad_norm": 0.0, "kl": 0.17107196152210236, "learning_rate": 2.6408942098890937e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8753 }, { "completion_length": 501.25, "epoch": 2.4262749445676275, "grad_norm": 0.0, "kl": 0.2397196739912033, "learning_rate": 2.6404571119199184e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8754 }, { "completion_length": 599.25, "epoch": 2.4265521064301554, "grad_norm": 1.7398676872253418, "kl": 353028.1875, "learning_rate": 2.6400200096435093e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8755 }, { "completion_length": 582.5, "epoch": 2.426829268292683, "grad_norm": 0.0, "kl": 0.16015568375587463, "learning_rate": 2.639582903073267e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8756 }, { "completion_length": 531.75, "epoch": 2.4271064301552108, "grad_norm": 0.0, "kl": 0.17346154153347015, "learning_rate": 2.6391457922225994e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8757 }, { "completion_length": 618.25, "epoch": 2.4273835920177382, "grad_norm": 0.0, "kl": 0.15333542227745056, "learning_rate": 2.6387086771049074e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8758 }, { "completion_length": 524.5, "epoch": 2.427660753880266, "grad_norm": 0.0, "kl": 0.5541488528251648, "learning_rate": 2.6382715577335986e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8759 }, { "completion_length": 632.25, "epoch": 2.4279379157427936, "grad_norm": 1.5017828941345215, "kl": 1432.7032470703125, "learning_rate": 2.637834434122076e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8760 }, { "completion_length": 776.75, "epoch": 2.4282150776053215, "grad_norm": 0.0, "kl": 0.2278553694486618, "learning_rate": 2.6373973062837447e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8761 }, { "completion_length": 618.0, "epoch": 2.4284922394678494, "grad_norm": 0.0, "kl": 0.16915474832057953, "learning_rate": 2.6369601742320096e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8762 }, { "completion_length": 594.75, "epoch": 2.428769401330377, "grad_norm": 2.1737184524536133, "kl": 0.17025716602802277, "learning_rate": 2.636523037980275e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8763 }, { "completion_length": 684.75, "epoch": 2.429046563192905, "grad_norm": 0.0, "kl": 472494211072.0, "learning_rate": 2.636085897541948e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8764 }, { "completion_length": 639.75, "epoch": 2.4293237250554323, "grad_norm": 0.0, "kl": 0.14817668497562408, "learning_rate": 2.635648752930432e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8765 }, { "completion_length": 658.75, "epoch": 2.42960088691796, "grad_norm": 0.0, "kl": 0.22824105620384216, "learning_rate": 2.6352116041591324e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8766 }, { "completion_length": 641.5, "epoch": 2.4298780487804876, "grad_norm": 0.0, "kl": 0.13611949980258942, "learning_rate": 2.634774451241456e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8767 }, { "completion_length": 569.75, "epoch": 2.4301552106430155, "grad_norm": 0.0, "kl": 26.859233856201172, "learning_rate": 2.634337294190808e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8768 }, { "completion_length": 676.0, "epoch": 2.4304323725055434, "grad_norm": 0.0, "kl": 0.13362745940685272, "learning_rate": 2.6339001330205938e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8769 }, { "completion_length": 570.5, "epoch": 2.430709534368071, "grad_norm": 0.0, "kl": 0.324704647064209, "learning_rate": 2.6334629677442195e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8770 }, { "completion_length": 653.25, "epoch": 2.430986696230599, "grad_norm": 0.0, "kl": 0.17389796674251556, "learning_rate": 2.633025798375092e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8771 }, { "completion_length": 605.75, "epoch": 2.4312638580931263, "grad_norm": 0.0, "kl": 0.15207041800022125, "learning_rate": 2.6325886249266157e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8772 }, { "completion_length": 602.75, "epoch": 2.431541019955654, "grad_norm": 0.0, "kl": 2436.83740234375, "learning_rate": 2.6321514474121973e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8773 }, { "completion_length": 527.5, "epoch": 2.4318181818181817, "grad_norm": 0.0, "kl": 0.5269466042518616, "learning_rate": 2.631714265845245e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8774 }, { "completion_length": 663.25, "epoch": 2.4320953436807096, "grad_norm": 0.0, "kl": 0.15590377151966095, "learning_rate": 2.631277080239163e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8775 }, { "completion_length": 700.25, "epoch": 2.4323725055432375, "grad_norm": 0.36764639616012573, "kl": 0.16016311943531036, "learning_rate": 2.6308398906073603e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8776 }, { "completion_length": 610.75, "epoch": 2.432649667405765, "grad_norm": 0.35803499817848206, "kl": 0.5242290496826172, "learning_rate": 2.630402696963241e-06, "loss": 0.0, "reward": 1.5, "reward_std": 0.5, "rewards/confident_score_func": -0.25, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8777 }, { "completion_length": 620.25, "epoch": 2.432926829268293, "grad_norm": 0.0, "kl": 0.1622748076915741, "learning_rate": 2.6299654993202146e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8778 }, { "completion_length": 511.25, "epoch": 2.4332039911308203, "grad_norm": 0.0, "kl": 0.14452797174453735, "learning_rate": 2.6295282976916863e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8779 }, { "completion_length": 573.25, "epoch": 2.433481152993348, "grad_norm": 0.0, "kl": 0.2334454357624054, "learning_rate": 2.6290910920910643e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8780 }, { "completion_length": 638.0, "epoch": 2.4337583148558757, "grad_norm": 0.0, "kl": 7.832675457000732, "learning_rate": 2.6286538825317556e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8781 }, { "completion_length": 563.25, "epoch": 2.4340354767184036, "grad_norm": 0.0, "kl": 0.3869216740131378, "learning_rate": 2.6282166690271664e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8782 }, { "completion_length": 664.5, "epoch": 2.4343126385809315, "grad_norm": 0.0, "kl": 0.1535598784685135, "learning_rate": 2.627779451590707e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8783 }, { "completion_length": 592.0, "epoch": 2.434589800443459, "grad_norm": 0.0, "kl": 0.1545201539993286, "learning_rate": 2.627342230235782e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8784 }, { "completion_length": 675.5, "epoch": 2.434866962305987, "grad_norm": 0.0, "kl": 0.1413305252790451, "learning_rate": 2.6269050049758016e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8785 }, { "completion_length": 675.0, "epoch": 2.4351441241685143, "grad_norm": 0.0, "kl": 0.17277084290981293, "learning_rate": 2.626467775824172e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8786 }, { "completion_length": 668.0, "epoch": 2.4354212860310422, "grad_norm": 0.0, "kl": 414197.5625, "learning_rate": 2.626030542794302e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8787 }, { "completion_length": 542.0, "epoch": 2.4356984478935697, "grad_norm": 0.0, "kl": 0.16156792640686035, "learning_rate": 2.625593305899599e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8788 }, { "completion_length": 621.75, "epoch": 2.4359756097560976, "grad_norm": 0.0, "kl": 0.1815786361694336, "learning_rate": 2.625156065153473e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8789 }, { "completion_length": 605.25, "epoch": 2.4362527716186255, "grad_norm": 0.0, "kl": 0.1584380567073822, "learning_rate": 2.6247188205693295e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8790 }, { "completion_length": 662.75, "epoch": 2.436529933481153, "grad_norm": 0.3356369137763977, "kl": 0.26752084493637085, "learning_rate": 2.62428157216058e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8791 }, { "completion_length": 576.0, "epoch": 2.436807095343681, "grad_norm": 0.0, "kl": 0.16001002490520477, "learning_rate": 2.623844319940631e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8792 }, { "completion_length": 659.5, "epoch": 2.4370842572062084, "grad_norm": 0.0, "kl": 0.1568446308374405, "learning_rate": 2.6234070639228924e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8793 }, { "completion_length": 665.25, "epoch": 2.4373614190687363, "grad_norm": 0.0, "kl": 0.16627894341945648, "learning_rate": 2.622969804120772e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8794 }, { "completion_length": 612.5, "epoch": 2.4376385809312637, "grad_norm": 0.0, "kl": 0.18633483350276947, "learning_rate": 2.6225325405476794e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8795 }, { "completion_length": 595.75, "epoch": 2.4379157427937916, "grad_norm": 0.0, "kl": 0.15092246234416962, "learning_rate": 2.622095273217023e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8796 }, { "completion_length": 761.75, "epoch": 2.438192904656319, "grad_norm": 0.0, "kl": 0.18289926648139954, "learning_rate": 2.621658002142213e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8797 }, { "completion_length": 728.5, "epoch": 2.438470066518847, "grad_norm": 0.3683505058288574, "kl": 947513131008.0, "learning_rate": 2.6212207273366587e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8798 }, { "completion_length": 549.25, "epoch": 2.4387472283813745, "grad_norm": 0.0, "kl": 0.1754075437784195, "learning_rate": 2.620783448813768e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8799 }, { "completion_length": 576.25, "epoch": 2.4390243902439024, "grad_norm": 0.0, "kl": 0.15641388297080994, "learning_rate": 2.6203461665869516e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8800 }, { "completion_length": 610.0, "epoch": 2.4393015521064303, "grad_norm": 0.0, "kl": 0.20170491933822632, "learning_rate": 2.6199088806696195e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8801 }, { "completion_length": 642.0, "epoch": 2.4395787139689578, "grad_norm": 0.0, "kl": 0.19545473158359528, "learning_rate": 2.6194715910751806e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8802 }, { "completion_length": 591.5, "epoch": 2.4398558758314857, "grad_norm": 0.0, "kl": 0.5259287357330322, "learning_rate": 2.6190342978170434e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8803 }, { "completion_length": 596.75, "epoch": 2.440133037694013, "grad_norm": 0.0, "kl": 0.17446862161159515, "learning_rate": 2.6185970009086215e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8804 }, { "completion_length": 619.25, "epoch": 2.440410199556541, "grad_norm": 0.5775512456893921, "kl": 195688923136.0, "learning_rate": 2.6181597003633218e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8805 }, { "completion_length": 590.5, "epoch": 2.4406873614190685, "grad_norm": 0.5183906555175781, "kl": 7636.86328125, "learning_rate": 2.617722396194557e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8806 }, { "completion_length": 453.5, "epoch": 2.4409645232815964, "grad_norm": 0.0, "kl": 0.16367916762828827, "learning_rate": 2.617285088415735e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8807 }, { "completion_length": 839.25, "epoch": 2.4412416851441243, "grad_norm": 0.24638375639915466, "kl": 398697922560.0, "learning_rate": 2.616847777040267e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8808 }, { "completion_length": 625.25, "epoch": 2.441518847006652, "grad_norm": 0.0, "kl": 0.15271902084350586, "learning_rate": 2.6164104620815643e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8809 }, { "completion_length": 521.75, "epoch": 2.4417960088691797, "grad_norm": 0.0, "kl": 0.18279224634170532, "learning_rate": 2.615973143553037e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8810 }, { "completion_length": 675.25, "epoch": 2.442073170731707, "grad_norm": 0.0, "kl": 0.1758686602115631, "learning_rate": 2.615535821468095e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8811 }, { "completion_length": 686.0, "epoch": 2.442350332594235, "grad_norm": 0.0, "kl": 312468.625, "learning_rate": 2.615098495840151e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8812 }, { "completion_length": 664.0, "epoch": 2.4426274944567625, "grad_norm": 0.843011200428009, "kl": 3.6796135902404785, "learning_rate": 2.614661166682614e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8813 }, { "completion_length": 540.25, "epoch": 2.4429046563192904, "grad_norm": 0.0, "kl": 0.1807791143655777, "learning_rate": 2.6142238340088965e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8814 }, { "completion_length": 645.0, "epoch": 2.4431818181818183, "grad_norm": 0.0, "kl": 0.3004243075847626, "learning_rate": 2.6137864978324097e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8815 }, { "completion_length": 570.25, "epoch": 2.443458980044346, "grad_norm": 0.0, "kl": 0.2965641915798187, "learning_rate": 2.6133491581665634e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8816 }, { "completion_length": 643.75, "epoch": 2.4437361419068737, "grad_norm": 0.0, "kl": 1383.577880859375, "learning_rate": 2.612911815024771e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8817 }, { "completion_length": 624.25, "epoch": 2.444013303769401, "grad_norm": 0.0, "kl": 0.24736887216567993, "learning_rate": 2.612474468420441e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8818 }, { "completion_length": 585.25, "epoch": 2.444290465631929, "grad_norm": 0.0, "kl": 0.172450989484787, "learning_rate": 2.612037118366989e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8819 }, { "completion_length": 588.5, "epoch": 2.4445676274944566, "grad_norm": 0.0, "kl": 0.49643149971961975, "learning_rate": 2.6115997648778236e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8820 }, { "completion_length": 606.25, "epoch": 2.4448447893569845, "grad_norm": 0.0, "kl": 0.20689480006694794, "learning_rate": 2.6111624079663585e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8821 }, { "completion_length": 966.5, "epoch": 2.4451219512195124, "grad_norm": 0.0, "kl": 113907.2890625, "learning_rate": 2.610725047646004e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8822 }, { "completion_length": 789.25, "epoch": 2.44539911308204, "grad_norm": 0.0, "kl": 0.16982407867908478, "learning_rate": 2.610287683930173e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8823 }, { "completion_length": 634.25, "epoch": 2.4456762749445677, "grad_norm": 0.4024440348148346, "kl": 1229526.0, "learning_rate": 2.6098503168322776e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8824 }, { "completion_length": 585.25, "epoch": 2.445953436807095, "grad_norm": 0.0, "kl": 0.15555350482463837, "learning_rate": 2.6094129463657303e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8825 }, { "completion_length": 636.75, "epoch": 2.446230598669623, "grad_norm": 0.0, "kl": 0.20333059132099152, "learning_rate": 2.608975572543942e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8826 }, { "completion_length": 718.5, "epoch": 2.4465077605321506, "grad_norm": 0.0, "kl": 0.14810864627361298, "learning_rate": 2.6085381953803274e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8827 }, { "completion_length": 657.0, "epoch": 2.4467849223946785, "grad_norm": 0.0, "kl": 0.19950862228870392, "learning_rate": 2.608100814888297e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8828 }, { "completion_length": 590.25, "epoch": 2.4470620842572064, "grad_norm": 0.0, "kl": 0.20293352007865906, "learning_rate": 2.6076634310812644e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8829 }, { "completion_length": 604.75, "epoch": 2.447339246119734, "grad_norm": 0.0, "kl": 0.20028778910636902, "learning_rate": 2.607226043972643e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8830 }, { "completion_length": 680.5, "epoch": 2.4476164079822618, "grad_norm": 0.0, "kl": 0.1332813948392868, "learning_rate": 2.606788653575844e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8831 }, { "completion_length": 518.25, "epoch": 2.4478935698447892, "grad_norm": 0.0, "kl": 0.19253210723400116, "learning_rate": 2.6063512599042823e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8832 }, { "completion_length": 561.75, "epoch": 2.448170731707317, "grad_norm": 0.0, "kl": 0.16375768184661865, "learning_rate": 2.605913862971369e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8833 }, { "completion_length": 603.75, "epoch": 2.4484478935698446, "grad_norm": 0.0, "kl": 0.23093348741531372, "learning_rate": 2.6054764627905183e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8834 }, { "completion_length": 667.0, "epoch": 2.4487250554323725, "grad_norm": 0.0, "kl": 0.18091611564159393, "learning_rate": 2.605039059375143e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8835 }, { "completion_length": 600.75, "epoch": 2.4490022172949004, "grad_norm": 0.0, "kl": 0.18566250801086426, "learning_rate": 2.6046016527386577e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8836 }, { "completion_length": 598.75, "epoch": 2.449279379157428, "grad_norm": 0.0, "kl": 0.19218827784061432, "learning_rate": 2.6041642428944737e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8837 }, { "completion_length": 669.5, "epoch": 2.449556541019956, "grad_norm": 0.0, "kl": 0.17302168905735016, "learning_rate": 2.6037268298560064e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8838 }, { "completion_length": 774.25, "epoch": 2.4498337028824833, "grad_norm": 0.0, "kl": 0.16613881289958954, "learning_rate": 2.6032894136366682e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8839 }, { "completion_length": 562.5, "epoch": 2.450110864745011, "grad_norm": 0.0, "kl": 0.16668039560317993, "learning_rate": 2.6028519942498733e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8840 }, { "completion_length": 595.5, "epoch": 2.4503880266075386, "grad_norm": 0.0, "kl": 0.1915445327758789, "learning_rate": 2.602414571709036e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8841 }, { "completion_length": 653.75, "epoch": 2.4506651884700665, "grad_norm": 0.0, "kl": 0.15265823900699615, "learning_rate": 2.6019771460275702e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8842 }, { "completion_length": 956.75, "epoch": 2.4509423503325944, "grad_norm": 1.1558430194854736, "kl": 0.34963858127593994, "learning_rate": 2.6015397172188885e-06, "loss": -0.0, "reward": 1.71875, "reward_std": 0.0625, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.71875, "step": 8843 }, { "completion_length": 581.25, "epoch": 2.451219512195122, "grad_norm": 1.9964147806167603, "kl": 128.29638671875, "learning_rate": 2.601102285296407e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8844 }, { "completion_length": 524.25, "epoch": 2.45149667405765, "grad_norm": 0.0, "kl": 0.2757244408130646, "learning_rate": 2.6006648502735384e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8845 }, { "completion_length": 548.5, "epoch": 2.4517738359201773, "grad_norm": 0.0, "kl": 0.19117015600204468, "learning_rate": 2.6002274121636976e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8846 }, { "completion_length": 611.25, "epoch": 2.452050997782705, "grad_norm": 0.0, "kl": 0.18247123062610626, "learning_rate": 2.599789970980299e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8847 }, { "completion_length": 689.5, "epoch": 2.4523281596452327, "grad_norm": 0.0, "kl": 0.2378302663564682, "learning_rate": 2.599352526736757e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8848 }, { "completion_length": 598.0, "epoch": 2.4526053215077606, "grad_norm": 0.0, "kl": 0.20083792507648468, "learning_rate": 2.5989150794464867e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8849 }, { "completion_length": 640.75, "epoch": 2.4528824833702885, "grad_norm": 0.0, "kl": 136271.1875, "learning_rate": 2.5984776291229026e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8850 }, { "completion_length": 601.75, "epoch": 2.453159645232816, "grad_norm": 0.0, "kl": 0.16249173879623413, "learning_rate": 2.5980401757794195e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8851 }, { "completion_length": 683.5, "epoch": 2.453436807095344, "grad_norm": 0.0, "kl": 0.16301055252552032, "learning_rate": 2.5976027194294514e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8852 }, { "completion_length": 576.0, "epoch": 2.4537139689578713, "grad_norm": 0.0, "kl": 0.21641848981380463, "learning_rate": 2.597165260086414e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8853 }, { "completion_length": 600.0, "epoch": 2.453991130820399, "grad_norm": 0.0, "kl": 0.21133725345134735, "learning_rate": 2.596727797763722e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8854 }, { "completion_length": 592.5, "epoch": 2.4542682926829267, "grad_norm": 0.6671132445335388, "kl": 410904821760.0, "learning_rate": 2.5962903324747913e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8855 }, { "completion_length": 702.5, "epoch": 2.4545454545454546, "grad_norm": 0.0, "kl": 0.18479590117931366, "learning_rate": 2.5958528642330362e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8856 }, { "completion_length": 742.0, "epoch": 2.4548226164079825, "grad_norm": 0.0, "kl": 0.1577301174402237, "learning_rate": 2.5954153930518727e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8857 }, { "completion_length": 697.0, "epoch": 2.45509977827051, "grad_norm": 0.0, "kl": 0.2460438758134842, "learning_rate": 2.594977918944716e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8858 }, { "completion_length": 754.75, "epoch": 2.455376940133038, "grad_norm": 0.0, "kl": 0.1536053717136383, "learning_rate": 2.594540441924982e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8859 }, { "completion_length": 528.5, "epoch": 2.4556541019955653, "grad_norm": 0.0, "kl": 0.176492840051651, "learning_rate": 2.5941029620060853e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8860 }, { "completion_length": 607.5, "epoch": 2.4559312638580932, "grad_norm": 0.0, "kl": 0.1862160861492157, "learning_rate": 2.5936654792014427e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8861 }, { "completion_length": 674.25, "epoch": 2.4562084257206207, "grad_norm": 0.0, "kl": 0.1605859398841858, "learning_rate": 2.59322799352447e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8862 }, { "completion_length": 566.5, "epoch": 2.4564855875831486, "grad_norm": 0.0, "kl": 0.23617608845233917, "learning_rate": 2.5927905049885817e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8863 }, { "completion_length": 577.75, "epoch": 2.4567627494456765, "grad_norm": 0.0, "kl": 0.2141299694776535, "learning_rate": 2.5923530136071944e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8864 }, { "completion_length": 626.25, "epoch": 2.457039911308204, "grad_norm": 0.0, "kl": 0.22779248654842377, "learning_rate": 2.5919155193937244e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8865 }, { "completion_length": 627.5, "epoch": 2.457317073170732, "grad_norm": 0.0, "kl": 0.16732558608055115, "learning_rate": 2.591478022361589e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8866 }, { "completion_length": 616.75, "epoch": 2.4575942350332594, "grad_norm": 0.45859649777412415, "kl": 0.32024338841438293, "learning_rate": 2.591040522524202e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8867 }, { "completion_length": 702.5, "epoch": 2.4578713968957873, "grad_norm": 0.0, "kl": 0.23360221087932587, "learning_rate": 2.5906030198949812e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8868 }, { "completion_length": 637.0, "epoch": 2.4581485587583147, "grad_norm": 0.0, "kl": 0.2123386114835739, "learning_rate": 2.5901655144873428e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8869 }, { "completion_length": 685.75, "epoch": 2.4584257206208426, "grad_norm": 0.0, "kl": 0.16614055633544922, "learning_rate": 2.5897280063147025e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8870 }, { "completion_length": 573.25, "epoch": 2.45870288248337, "grad_norm": 0.0, "kl": 0.18076308071613312, "learning_rate": 2.589290495390478e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8871 }, { "completion_length": 601.75, "epoch": 2.458980044345898, "grad_norm": 0.0, "kl": 0.14873650670051575, "learning_rate": 2.588852981728086e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8872 }, { "completion_length": 675.0, "epoch": 2.4592572062084255, "grad_norm": 0.0, "kl": 0.16341814398765564, "learning_rate": 2.5884154653409415e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8873 }, { "completion_length": 551.25, "epoch": 2.4595343680709534, "grad_norm": 0.0, "kl": 0.16254954040050507, "learning_rate": 2.5879779462424635e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8874 }, { "completion_length": 714.75, "epoch": 2.4598115299334813, "grad_norm": 0.0, "kl": 0.22399353981018066, "learning_rate": 2.5875404244460678e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8875 }, { "completion_length": 584.0, "epoch": 2.4600886917960088, "grad_norm": 0.0, "kl": 0.16444940865039825, "learning_rate": 2.587102899965171e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8876 }, { "completion_length": 613.0, "epoch": 2.4603658536585367, "grad_norm": 0.0, "kl": 0.17880979180335999, "learning_rate": 2.586665372813191e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8877 }, { "completion_length": 605.5, "epoch": 2.460643015521064, "grad_norm": 0.0, "kl": 0.19506068527698517, "learning_rate": 2.586227843003544e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8878 }, { "completion_length": 679.75, "epoch": 2.460920177383592, "grad_norm": 0.0, "kl": 0.14525598287582397, "learning_rate": 2.585790310549648e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8879 }, { "completion_length": 571.5, "epoch": 2.4611973392461195, "grad_norm": 3.275465726852417, "kl": 6.4467949867248535, "learning_rate": 2.58535277546492e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8880 }, { "completion_length": 559.0, "epoch": 2.4614745011086474, "grad_norm": 0.0, "kl": 0.15713196992874146, "learning_rate": 2.584915237762778e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8881 }, { "completion_length": 590.0, "epoch": 2.4617516629711753, "grad_norm": 0.0, "kl": 0.1899988353252411, "learning_rate": 2.584477697456638e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8882 }, { "completion_length": 674.5, "epoch": 2.462028824833703, "grad_norm": 0.0, "kl": 0.22747842967510223, "learning_rate": 2.584040154559919e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8883 }, { "completion_length": 593.75, "epoch": 2.4623059866962307, "grad_norm": 0.6749280691146851, "kl": 2513729.0, "learning_rate": 2.583602609086038e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8884 }, { "completion_length": 626.0, "epoch": 2.462583148558758, "grad_norm": 0.0, "kl": 0.28925183415412903, "learning_rate": 2.583165061048412e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8885 }, { "completion_length": 646.0, "epoch": 2.462860310421286, "grad_norm": 0.0, "kl": 0.15698277950286865, "learning_rate": 2.5827275104604606e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8886 }, { "completion_length": 687.75, "epoch": 2.4631374722838135, "grad_norm": 0.0, "kl": 0.19263955950737, "learning_rate": 2.5822899573356e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8887 }, { "completion_length": 659.75, "epoch": 2.4634146341463414, "grad_norm": 0.0, "kl": 0.2019902467727661, "learning_rate": 2.5818524016872493e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8888 }, { "completion_length": 654.75, "epoch": 2.4636917960088693, "grad_norm": 0.0, "kl": 0.17768892645835876, "learning_rate": 2.5814148435288257e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8889 }, { "completion_length": 643.5, "epoch": 2.463968957871397, "grad_norm": 0.0, "kl": 0.19386419653892517, "learning_rate": 2.5809772828737474e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8890 }, { "completion_length": 687.25, "epoch": 2.4642461197339247, "grad_norm": 0.0, "kl": 0.16465239226818085, "learning_rate": 2.5805397197354333e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8891 }, { "completion_length": 623.0, "epoch": 2.464523281596452, "grad_norm": 0.0, "kl": 0.15660955011844635, "learning_rate": 2.5801021541273e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8892 }, { "completion_length": 575.0, "epoch": 2.46480044345898, "grad_norm": 0.0, "kl": 0.41794273257255554, "learning_rate": 2.5796645860627665e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8893 }, { "completion_length": 680.0, "epoch": 2.4650776053215075, "grad_norm": 0.0, "kl": 0.17882563173770905, "learning_rate": 2.579227015555252e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8894 }, { "completion_length": 661.5, "epoch": 2.4653547671840355, "grad_norm": 0.0, "kl": 0.15812890231609344, "learning_rate": 2.578789442618176e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8895 }, { "completion_length": 593.0, "epoch": 2.4656319290465634, "grad_norm": 0.0, "kl": 0.2033405900001526, "learning_rate": 2.5783518672649538e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8896 }, { "completion_length": 690.0, "epoch": 2.465909090909091, "grad_norm": 0.0, "kl": 0.169041708111763, "learning_rate": 2.5779142895090066e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8897 }, { "completion_length": 630.25, "epoch": 2.4661862527716187, "grad_norm": 0.0, "kl": 0.16931140422821045, "learning_rate": 2.577476709363752e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8898 }, { "completion_length": 612.75, "epoch": 2.466463414634146, "grad_norm": 0.0, "kl": 0.1483866572380066, "learning_rate": 2.577039126842609e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8899 }, { "completion_length": 716.75, "epoch": 2.466740576496674, "grad_norm": 0.0, "kl": 0.13844828307628632, "learning_rate": 2.5766015419589963e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8900 }, { "completion_length": 702.75, "epoch": 2.4670177383592016, "grad_norm": 0.0, "kl": 0.14608223736286163, "learning_rate": 2.5761639547263328e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8901 }, { "completion_length": 662.25, "epoch": 2.4672949002217295, "grad_norm": 0.0, "kl": 0.1555640995502472, "learning_rate": 2.575726365158038e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8902 }, { "completion_length": 663.5, "epoch": 2.4675720620842574, "grad_norm": 0.0, "kl": 1.155191421508789, "learning_rate": 2.5752887732675307e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8903 }, { "completion_length": 576.25, "epoch": 2.467849223946785, "grad_norm": 0.0, "kl": 0.25175654888153076, "learning_rate": 2.57485117906823e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8904 }, { "completion_length": 765.75, "epoch": 2.4681263858093128, "grad_norm": 0.0, "kl": 0.20115135610103607, "learning_rate": 2.574413582573555e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8905 }, { "completion_length": 569.0, "epoch": 2.4684035476718402, "grad_norm": 0.0, "kl": 20515150.0, "learning_rate": 2.5739759837969247e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8906 }, { "completion_length": 624.0, "epoch": 2.468680709534368, "grad_norm": 0.0, "kl": 0.15852154791355133, "learning_rate": 2.5735383827517592e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8907 }, { "completion_length": 661.0, "epoch": 2.4689578713968956, "grad_norm": 0.0, "kl": 0.17412425577640533, "learning_rate": 2.573100779451476e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8908 }, { "completion_length": 547.25, "epoch": 2.4692350332594235, "grad_norm": 0.0, "kl": 0.24125626683235168, "learning_rate": 2.572663173909497e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8909 }, { "completion_length": 689.25, "epoch": 2.4695121951219514, "grad_norm": 0.0, "kl": 0.1840345561504364, "learning_rate": 2.5722255661392415e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8910 }, { "completion_length": 689.25, "epoch": 2.469789356984479, "grad_norm": 0.0, "kl": 0.17429675161838531, "learning_rate": 2.571787956154127e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8911 }, { "completion_length": 641.0, "epoch": 2.470066518847007, "grad_norm": 0.0, "kl": 0.19435113668441772, "learning_rate": 2.571350343967575e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8912 }, { "completion_length": 637.75, "epoch": 2.4703436807095343, "grad_norm": 0.0, "kl": 0.1882115602493286, "learning_rate": 2.5709127295930054e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8913 }, { "completion_length": 566.25, "epoch": 2.470620842572062, "grad_norm": 0.0, "kl": 0.18666982650756836, "learning_rate": 2.5704751130438365e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8914 }, { "completion_length": 721.5, "epoch": 2.4708980044345896, "grad_norm": 0.0, "kl": 0.18339672684669495, "learning_rate": 2.570037494333489e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8915 }, { "completion_length": 581.5, "epoch": 2.4711751662971175, "grad_norm": 0.0, "kl": 0.19578254222869873, "learning_rate": 2.569599873475383e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8916 }, { "completion_length": 585.5, "epoch": 2.4714523281596454, "grad_norm": 0.0, "kl": 0.2250385880470276, "learning_rate": 2.5691622504829396e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8917 }, { "completion_length": 549.0, "epoch": 2.471729490022173, "grad_norm": 0.0, "kl": 0.18206915259361267, "learning_rate": 2.5687246253695765e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8918 }, { "completion_length": 643.25, "epoch": 2.472006651884701, "grad_norm": 0.3818148672580719, "kl": 3824601.25, "learning_rate": 2.5682869981487154e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8919 }, { "completion_length": 680.0, "epoch": 2.4722838137472283, "grad_norm": 0.0, "kl": 0.15544487535953522, "learning_rate": 2.5678493688337758e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8920 }, { "completion_length": 737.5, "epoch": 2.472560975609756, "grad_norm": 0.0, "kl": 0.16719336807727814, "learning_rate": 2.5674117374381785e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8921 }, { "completion_length": 559.25, "epoch": 2.4728381374722836, "grad_norm": 0.0, "kl": 0.203790083527565, "learning_rate": 2.5669741039753435e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8922 }, { "completion_length": 676.5, "epoch": 2.4731152993348116, "grad_norm": 0.0, "kl": 0.19442713260650635, "learning_rate": 2.5665364684586913e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8923 }, { "completion_length": 614.25, "epoch": 2.4733924611973395, "grad_norm": 0.0, "kl": 0.175573468208313, "learning_rate": 2.5660988309016425e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8924 }, { "completion_length": 743.5, "epoch": 2.473669623059867, "grad_norm": 0.0, "kl": 0.1806601583957672, "learning_rate": 2.565661191317618e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8925 }, { "completion_length": 656.5, "epoch": 2.473946784922395, "grad_norm": 0.0, "kl": 0.1423848271369934, "learning_rate": 2.565223549720037e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8926 }, { "completion_length": 583.75, "epoch": 2.4742239467849223, "grad_norm": 0.0, "kl": 0.18914276361465454, "learning_rate": 2.5647859061223216e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8927 }, { "completion_length": 660.75, "epoch": 2.47450110864745, "grad_norm": 0.0, "kl": 0.2113407701253891, "learning_rate": 2.564348260537892e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8928 }, { "completion_length": 605.25, "epoch": 2.4747782705099777, "grad_norm": 0.0, "kl": 0.18788428604602814, "learning_rate": 2.563910612980169e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8929 }, { "completion_length": 609.75, "epoch": 2.4750554323725056, "grad_norm": 0.0, "kl": 0.22733446955680847, "learning_rate": 2.563472963462573e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8930 }, { "completion_length": 638.75, "epoch": 2.4753325942350335, "grad_norm": 0.0, "kl": 0.24253495037555695, "learning_rate": 2.5630353119985254e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8931 }, { "completion_length": 652.25, "epoch": 2.475609756097561, "grad_norm": 0.0, "kl": 0.2548671364784241, "learning_rate": 2.562597658601447e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8932 }, { "completion_length": 707.25, "epoch": 2.475886917960089, "grad_norm": 0.0, "kl": 0.1991874724626541, "learning_rate": 2.5621600032847594e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8933 }, { "completion_length": 663.25, "epoch": 2.4761640798226163, "grad_norm": 0.0, "kl": 0.1576969474554062, "learning_rate": 2.5617223460618825e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8934 }, { "completion_length": 708.0, "epoch": 2.4764412416851442, "grad_norm": 0.0, "kl": 0.18834592401981354, "learning_rate": 2.5612846869462384e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8935 }, { "completion_length": 710.5, "epoch": 2.4767184035476717, "grad_norm": 0.0, "kl": 0.8956484198570251, "learning_rate": 2.5608470259512476e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8936 }, { "completion_length": 691.0, "epoch": 2.4769955654101996, "grad_norm": 0.4762595295906067, "kl": 732739.625, "learning_rate": 2.560409363090331e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8937 }, { "completion_length": 712.0, "epoch": 2.4772727272727275, "grad_norm": 0.0, "kl": 0.16801021993160248, "learning_rate": 2.559971698376911e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8938 }, { "completion_length": 601.25, "epoch": 2.477549889135255, "grad_norm": 0.0, "kl": 0.20018693804740906, "learning_rate": 2.559534031824409e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8939 }, { "completion_length": 581.25, "epoch": 2.477827050997783, "grad_norm": 0.0, "kl": 0.17890064418315887, "learning_rate": 2.5590963634462456e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8940 }, { "completion_length": 621.75, "epoch": 2.4781042128603104, "grad_norm": 0.0, "kl": 0.14739418029785156, "learning_rate": 2.5586586932558422e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8941 }, { "completion_length": 658.5, "epoch": 2.4783813747228383, "grad_norm": 0.0, "kl": 0.1634860634803772, "learning_rate": 2.5582210212666212e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8942 }, { "completion_length": 712.5, "epoch": 2.4786585365853657, "grad_norm": 0.0, "kl": 0.16565744578838348, "learning_rate": 2.5577833474920035e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8943 }, { "completion_length": 667.0, "epoch": 2.4789356984478936, "grad_norm": 0.0, "kl": 0.15376582741737366, "learning_rate": 2.5573456719454106e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8944 }, { "completion_length": 825.5, "epoch": 2.479212860310421, "grad_norm": 0.0, "kl": 0.144003763794899, "learning_rate": 2.556907994640264e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8945 }, { "completion_length": 818.5, "epoch": 2.479490022172949, "grad_norm": 0.0, "kl": 0.1470692902803421, "learning_rate": 2.556470315589987e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8946 }, { "completion_length": 664.75, "epoch": 2.479767184035477, "grad_norm": 0.0, "kl": 0.19512224197387695, "learning_rate": 2.556032634808e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8947 }, { "completion_length": 668.75, "epoch": 2.4800443458980044, "grad_norm": 0.0, "kl": 0.167424276471138, "learning_rate": 2.555594952307725e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8948 }, { "completion_length": 728.25, "epoch": 2.4803215077605323, "grad_norm": 0.0, "kl": 0.23485824465751648, "learning_rate": 2.555157268102585e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8949 }, { "completion_length": 653.25, "epoch": 2.4805986696230597, "grad_norm": 0.0, "kl": 0.1690482646226883, "learning_rate": 2.554719582206e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8950 }, { "completion_length": 581.5, "epoch": 2.4808758314855877, "grad_norm": 0.0, "kl": 0.2363327145576477, "learning_rate": 2.5542818946313936e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8951 }, { "completion_length": 532.5, "epoch": 2.481152993348115, "grad_norm": 0.0, "kl": 0.2848353385925293, "learning_rate": 2.553844205392186e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8952 }, { "completion_length": 576.5, "epoch": 2.481430155210643, "grad_norm": 0.0, "kl": 0.23040764033794403, "learning_rate": 2.5534065145018023e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8953 }, { "completion_length": 720.0, "epoch": 2.4817073170731705, "grad_norm": 0.0, "kl": 0.16048230230808258, "learning_rate": 2.552968821973662e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8954 }, { "completion_length": 646.75, "epoch": 2.4819844789356984, "grad_norm": 0.0, "kl": 0.25997236371040344, "learning_rate": 2.5525311278211888e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8955 }, { "completion_length": 664.0, "epoch": 2.4822616407982263, "grad_norm": 0.0, "kl": 176230.6875, "learning_rate": 2.5520934320578045e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8956 }, { "completion_length": 608.75, "epoch": 2.4825388026607538, "grad_norm": 0.0, "kl": 0.20572078227996826, "learning_rate": 2.5516557346969313e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8957 }, { "completion_length": 543.0, "epoch": 2.4828159645232817, "grad_norm": 0.0, "kl": 0.18288709223270416, "learning_rate": 2.5512180357519913e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8958 }, { "completion_length": 576.25, "epoch": 2.483093126385809, "grad_norm": 0.0, "kl": 0.15965671837329865, "learning_rate": 2.550780335236407e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8959 }, { "completion_length": 761.0, "epoch": 2.483370288248337, "grad_norm": 0.0, "kl": 0.14440859854221344, "learning_rate": 2.550342633163601e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8960 }, { "completion_length": 559.0, "epoch": 2.4836474501108645, "grad_norm": 1.8033663034439087, "kl": 207958.375, "learning_rate": 2.5499049295469964e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8961 }, { "completion_length": 586.25, "epoch": 2.4839246119733924, "grad_norm": 0.0, "kl": 0.16749948263168335, "learning_rate": 2.549467224400015e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8962 }, { "completion_length": 640.25, "epoch": 2.4842017738359203, "grad_norm": 0.0, "kl": 0.37944164872169495, "learning_rate": 2.549029517736079e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8963 }, { "completion_length": 679.25, "epoch": 2.484478935698448, "grad_norm": 0.42839816212654114, "kl": 0.1427091658115387, "learning_rate": 2.548591809568613e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8964 }, { "completion_length": 516.25, "epoch": 2.4847560975609757, "grad_norm": 3.278233051300049, "kl": 2740246.0, "learning_rate": 2.548154099911037e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8965 }, { "completion_length": 763.75, "epoch": 2.485033259423503, "grad_norm": 0.0, "kl": 0.1389949470758438, "learning_rate": 2.5477163887767766e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8966 }, { "completion_length": 745.5, "epoch": 2.485310421286031, "grad_norm": 0.0, "kl": 0.1572851687669754, "learning_rate": 2.547278676179251e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8967 }, { "completion_length": 585.75, "epoch": 2.4855875831485585, "grad_norm": 0.0, "kl": 0.19591118395328522, "learning_rate": 2.546840962131887e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8968 }, { "completion_length": 661.75, "epoch": 2.4858647450110865, "grad_norm": 0.0, "kl": 0.18001268804073334, "learning_rate": 2.5464032466481043e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8969 }, { "completion_length": 697.75, "epoch": 2.4861419068736144, "grad_norm": 0.0, "kl": 0.18052907288074493, "learning_rate": 2.5459655297413275e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8970 }, { "completion_length": 604.75, "epoch": 2.486419068736142, "grad_norm": 0.0, "kl": 0.23481903970241547, "learning_rate": 2.545527811424979e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8971 }, { "completion_length": 649.75, "epoch": 2.4866962305986697, "grad_norm": 0.39636924862861633, "kl": 0.23785504698753357, "learning_rate": 2.5450900917124828e-06, "loss": 0.0, "reward": 2.875, "reward_std": 1.9311050176620483, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8972 }, { "completion_length": 708.25, "epoch": 2.486973392461197, "grad_norm": 0.0, "kl": 1001.8834838867188, "learning_rate": 2.5446523706172598e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8973 }, { "completion_length": 697.5, "epoch": 2.487250554323725, "grad_norm": 0.0, "kl": 0.15905901789665222, "learning_rate": 2.5442146481527348e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8974 }, { "completion_length": 638.5, "epoch": 2.4875277161862526, "grad_norm": 3.572207450866699, "kl": 142967.125, "learning_rate": 2.5437769243323302e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8975 }, { "completion_length": 713.25, "epoch": 2.4878048780487805, "grad_norm": 0.0, "kl": 0.17009049654006958, "learning_rate": 2.54333919916947e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8976 }, { "completion_length": 695.0, "epoch": 2.4880820399113084, "grad_norm": 0.0, "kl": 0.743990421295166, "learning_rate": 2.5429014726775765e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8977 }, { "completion_length": 583.25, "epoch": 2.488359201773836, "grad_norm": 1.751011610031128, "kl": 1033307.5, "learning_rate": 2.5424637448700727e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8978 }, { "completion_length": 720.5, "epoch": 2.4886363636363638, "grad_norm": 0.0, "kl": 0.1919908970594406, "learning_rate": 2.5420260157603836e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8979 }, { "completion_length": 651.5, "epoch": 2.488913525498891, "grad_norm": 0.0, "kl": 0.17328932881355286, "learning_rate": 2.541588285361931e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8980 }, { "completion_length": 572.75, "epoch": 2.489190687361419, "grad_norm": 0.0, "kl": 0.21755914390087128, "learning_rate": 2.5411505536881393e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8981 }, { "completion_length": 637.5, "epoch": 2.4894678492239466, "grad_norm": 0.0, "kl": 0.17123766243457794, "learning_rate": 2.5407128207524295e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8982 }, { "completion_length": 869.5, "epoch": 2.4897450110864745, "grad_norm": 0.0, "kl": 0.13598987460136414, "learning_rate": 2.5402750865682283e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8983 }, { "completion_length": 619.0, "epoch": 2.4900221729490024, "grad_norm": 0.0, "kl": 20715.25, "learning_rate": 2.539837351148958e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8984 }, { "completion_length": 567.5, "epoch": 2.49029933481153, "grad_norm": 0.0, "kl": 0.175933837890625, "learning_rate": 2.5393996145080413e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8985 }, { "completion_length": 588.0, "epoch": 2.490576496674058, "grad_norm": 0.0, "kl": 0.2554168403148651, "learning_rate": 2.538961876658902e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8986 }, { "completion_length": 602.0, "epoch": 2.4908536585365852, "grad_norm": 0.0, "kl": 0.24539169669151306, "learning_rate": 2.538524137614965e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8987 }, { "completion_length": 656.0, "epoch": 2.491130820399113, "grad_norm": 2.2006850242614746, "kl": 1020.5533447265625, "learning_rate": 2.538086397389652e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8988 }, { "completion_length": 653.5, "epoch": 2.4914079822616406, "grad_norm": 0.0, "kl": 0.1600663959980011, "learning_rate": 2.537648655996388e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8989 }, { "completion_length": 583.5, "epoch": 2.4916851441241685, "grad_norm": 0.0, "kl": 0.20554061233997345, "learning_rate": 2.5372109134485956e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8990 }, { "completion_length": 636.5, "epoch": 2.4919623059866964, "grad_norm": 0.9992343783378601, "kl": 0.8575076460838318, "learning_rate": 2.5367731697597004e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8991 }, { "completion_length": 639.5, "epoch": 2.492239467849224, "grad_norm": 0.0, "kl": 0.15895915031433105, "learning_rate": 2.5363354249431245e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8992 }, { "completion_length": 625.25, "epoch": 2.492516629711752, "grad_norm": 0.0, "kl": 0.1976517289876938, "learning_rate": 2.5358976790122924e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8993 }, { "completion_length": 657.75, "epoch": 2.4927937915742793, "grad_norm": 0.0, "kl": 2683728.0, "learning_rate": 2.535459931980628e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8994 }, { "completion_length": 669.25, "epoch": 2.493070953436807, "grad_norm": 0.0, "kl": 0.2170460969209671, "learning_rate": 2.5350221838615545e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8995 }, { "completion_length": 753.0, "epoch": 2.4933481152993346, "grad_norm": 0.0, "kl": 0.14450976252555847, "learning_rate": 2.5345844346684974e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8996 }, { "completion_length": 658.25, "epoch": 2.4936252771618626, "grad_norm": 0.3716540038585663, "kl": 0.21064484119415283, "learning_rate": 2.5341466844148775e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8997 }, { "completion_length": 622.0, "epoch": 2.4939024390243905, "grad_norm": 0.0, "kl": 0.21418096125125885, "learning_rate": 2.5337089331141225e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8998 }, { "completion_length": 740.5, "epoch": 2.494179600886918, "grad_norm": 0.0, "kl": 0.2746717631816864, "learning_rate": 2.5332711807796545e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 8999 }, { "completion_length": 711.0, "epoch": 2.494456762749446, "grad_norm": 2.2118587493896484, "kl": 233758965760.0, "learning_rate": 2.532833427424898e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9000 }, { "completion_length": 626.25, "epoch": 2.4947339246119733, "grad_norm": 0.0, "kl": 0.18828444182872772, "learning_rate": 2.532395673063276e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9001 }, { "completion_length": 724.5, "epoch": 2.495011086474501, "grad_norm": 0.0, "kl": 0.16551636159420013, "learning_rate": 2.531957917708214e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9002 }, { "completion_length": 652.75, "epoch": 2.4952882483370287, "grad_norm": 0.0, "kl": 0.17992472648620605, "learning_rate": 2.5315201613731354e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9003 }, { "completion_length": 649.5, "epoch": 2.4955654101995566, "grad_norm": 0.0, "kl": 0.15916268527507782, "learning_rate": 2.5310824040714654e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9004 }, { "completion_length": 781.75, "epoch": 2.4958425720620845, "grad_norm": 0.0, "kl": 6.9057230949401855, "learning_rate": 2.530644645816626e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9005 }, { "completion_length": 578.25, "epoch": 2.496119733924612, "grad_norm": 0.0, "kl": 0.19156722724437714, "learning_rate": 2.5302068866220438e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9006 }, { "completion_length": 683.5, "epoch": 2.49639689578714, "grad_norm": 0.0, "kl": 20.01877784729004, "learning_rate": 2.5297691265011415e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9007 }, { "completion_length": 743.25, "epoch": 2.4966740576496673, "grad_norm": 1.7300723791122437, "kl": 526.27587890625, "learning_rate": 2.5293313654673447e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9008 }, { "completion_length": 729.75, "epoch": 2.4969512195121952, "grad_norm": 1.9781357049942017, "kl": 0.16200318932533264, "learning_rate": 2.528893603534076e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9009 }, { "completion_length": 550.0, "epoch": 2.4972283813747227, "grad_norm": 0.0, "kl": 0.1656763255596161, "learning_rate": 2.5284558407147606e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9010 }, { "completion_length": 694.5, "epoch": 2.4975055432372506, "grad_norm": 0.0, "kl": 1025.2462158203125, "learning_rate": 2.528018077022824e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9011 }, { "completion_length": 504.0, "epoch": 2.4977827050997785, "grad_norm": 0.0, "kl": 0.18637223541736603, "learning_rate": 2.5275803124716892e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9012 }, { "completion_length": 685.75, "epoch": 2.498059866962306, "grad_norm": 0.0, "kl": 0.19382181763648987, "learning_rate": 2.52714254707478e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9013 }, { "completion_length": 581.0, "epoch": 2.498337028824834, "grad_norm": 0.0, "kl": 0.1719403713941574, "learning_rate": 2.5267047808455225e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9014 }, { "completion_length": 524.0, "epoch": 2.4986141906873613, "grad_norm": 0.0, "kl": 1627734.0, "learning_rate": 2.5262670137973413e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9015 }, { "completion_length": 648.25, "epoch": 2.4988913525498893, "grad_norm": 0.0, "kl": 0.3161774277687073, "learning_rate": 2.5258292459436584e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9016 }, { "completion_length": 667.25, "epoch": 2.4991685144124167, "grad_norm": 0.0, "kl": 0.38328853249549866, "learning_rate": 2.5253914772979014e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9017 }, { "completion_length": 580.75, "epoch": 2.4994456762749446, "grad_norm": 0.7631409168243408, "kl": 165687.984375, "learning_rate": 2.5249537078734924e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9018 }, { "completion_length": 646.25, "epoch": 2.499722838137472, "grad_norm": 0.0, "kl": 0.376765638589859, "learning_rate": 2.524515937683858e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9019 }, { "completion_length": 592.5, "epoch": 2.5, "grad_norm": 0.0, "kl": 0.21660000085830688, "learning_rate": 2.52407816674242e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9020 }, { "completion_length": 601.5, "epoch": 2.5002771618625275, "grad_norm": 0.0, "kl": 14995384434688.0, "learning_rate": 2.5236403950626063e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9021 }, { "completion_length": 860.25, "epoch": 2.5005543237250554, "grad_norm": 0.0, "kl": 0.17550761997699738, "learning_rate": 2.52320262265784e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9022 }, { "completion_length": 639.75, "epoch": 2.5008314855875833, "grad_norm": 0.0, "kl": 0.17461906373500824, "learning_rate": 2.522764849541546e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9023 }, { "completion_length": 606.0, "epoch": 2.5011086474501107, "grad_norm": 0.0, "kl": 0.24124214053153992, "learning_rate": 2.5223270757271478e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9024 }, { "completion_length": 630.75, "epoch": 2.5013858093126387, "grad_norm": 0.0, "kl": 0.23395270109176636, "learning_rate": 2.521889301228071e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9025 }, { "completion_length": 720.75, "epoch": 2.5016629711751666, "grad_norm": 0.0, "kl": 0.14105580747127533, "learning_rate": 2.5214515260577414e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9026 }, { "completion_length": 713.75, "epoch": 2.501940133037694, "grad_norm": 0.0, "kl": 0.37321653962135315, "learning_rate": 2.5210137502295817e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9027 }, { "completion_length": 590.5, "epoch": 2.5022172949002215, "grad_norm": 0.0, "kl": 47513178112.0, "learning_rate": 2.5205759737570184e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9028 }, { "completion_length": 613.25, "epoch": 2.5024944567627494, "grad_norm": 0.0, "kl": 3.3224728107452393, "learning_rate": 2.5201381966534748e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9029 }, { "completion_length": 664.25, "epoch": 2.5027716186252773, "grad_norm": 0.0, "kl": 0.2963912785053253, "learning_rate": 2.5197004189323777e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9030 }, { "completion_length": 657.75, "epoch": 2.5030487804878048, "grad_norm": 0.0, "kl": 10.800850868225098, "learning_rate": 2.5192626406071496e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9031 }, { "completion_length": 727.25, "epoch": 2.5033259423503327, "grad_norm": 0.0, "kl": 33701.05078125, "learning_rate": 2.5188248616912176e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9032 }, { "completion_length": 852.5, "epoch": 2.5036031042128606, "grad_norm": 0.0, "kl": 0.1288691759109497, "learning_rate": 2.5183870821980044e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9033 }, { "completion_length": 749.0, "epoch": 2.503880266075388, "grad_norm": 0.4709855914115906, "kl": 0.15245608985424042, "learning_rate": 2.517949302140936e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9034 }, { "completion_length": 778.0, "epoch": 2.5041574279379155, "grad_norm": 0.0, "kl": 9638278.0, "learning_rate": 2.5175115215334374e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9035 }, { "completion_length": 640.5, "epoch": 2.5044345898004434, "grad_norm": 0.0, "kl": 0.21510586142539978, "learning_rate": 2.5170737403889334e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9036 }, { "completion_length": 586.5, "epoch": 2.5047117516629713, "grad_norm": 0.0, "kl": 0.23719017207622528, "learning_rate": 2.5166359587208483e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9037 }, { "completion_length": 648.5, "epoch": 2.504988913525499, "grad_norm": 0.0, "kl": 3.6371655464172363, "learning_rate": 2.5161981765426088e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9038 }, { "completion_length": 683.25, "epoch": 2.5052660753880267, "grad_norm": 2.327660322189331, "kl": 1632.437255859375, "learning_rate": 2.5157603938676374e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9039 }, { "completion_length": 614.75, "epoch": 2.505543237250554, "grad_norm": 0.0, "kl": 0.20726734399795532, "learning_rate": 2.5153226107093604e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9040 }, { "completion_length": 733.0, "epoch": 2.505820399113082, "grad_norm": 0.0, "kl": 0.17484045028686523, "learning_rate": 2.5148848270812025e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9041 }, { "completion_length": 634.25, "epoch": 2.5060975609756095, "grad_norm": 0.0, "kl": 0.1638828068971634, "learning_rate": 2.5144470429965895e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9042 }, { "completion_length": 680.0, "epoch": 2.5063747228381374, "grad_norm": 0.0, "kl": 0.1656600534915924, "learning_rate": 2.5140092584689453e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9043 }, { "completion_length": 625.25, "epoch": 2.5066518847006654, "grad_norm": 0.0, "kl": 0.17263637483119965, "learning_rate": 2.5135714735116957e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9044 }, { "completion_length": 620.25, "epoch": 2.506929046563193, "grad_norm": 0.0, "kl": 0.2253798246383667, "learning_rate": 2.5131336881382658e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9045 }, { "completion_length": 664.5, "epoch": 2.5072062084257207, "grad_norm": 0.0, "kl": 0.16538795828819275, "learning_rate": 2.5126959023620793e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9046 }, { "completion_length": 730.5, "epoch": 2.507483370288248, "grad_norm": 0.5285708904266357, "kl": 961447.0, "learning_rate": 2.512258116196563e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9047 }, { "completion_length": 688.0, "epoch": 2.507760532150776, "grad_norm": 0.3883794844150543, "kl": 6094394.0, "learning_rate": 2.511820329655141e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9048 }, { "completion_length": 643.0, "epoch": 2.5080376940133036, "grad_norm": 0.0, "kl": 0.17803142964839935, "learning_rate": 2.511382542751239e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9049 }, { "completion_length": 656.5, "epoch": 2.5083148558758315, "grad_norm": 1.5503462553024292, "kl": 741.6503295898438, "learning_rate": 2.5109447554982813e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9050 }, { "completion_length": 668.5, "epoch": 2.5085920177383594, "grad_norm": 0.0, "kl": 0.20787811279296875, "learning_rate": 2.5105069679096942e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9051 }, { "completion_length": 626.75, "epoch": 2.508869179600887, "grad_norm": 0.0, "kl": 0.2955753803253174, "learning_rate": 2.5100691799989013e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9052 }, { "completion_length": 746.5, "epoch": 2.5091463414634148, "grad_norm": 0.0, "kl": 0.1770602911710739, "learning_rate": 2.5096313917793297e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9053 }, { "completion_length": 576.0, "epoch": 2.509423503325942, "grad_norm": 0.0, "kl": 0.2515842616558075, "learning_rate": 2.509193603264402e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9054 }, { "completion_length": 564.5, "epoch": 2.50970066518847, "grad_norm": 0.0, "kl": 0.197929248213768, "learning_rate": 2.508755814467546e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9055 }, { "completion_length": 607.25, "epoch": 2.5099778270509976, "grad_norm": 2.9306418895721436, "kl": 0.20037594437599182, "learning_rate": 2.508318025402184e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9056 }, { "completion_length": 683.0, "epoch": 2.5102549889135255, "grad_norm": 0.7108168601989746, "kl": 6774195.0, "learning_rate": 2.507880236081744e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9057 }, { "completion_length": 684.25, "epoch": 2.5105321507760534, "grad_norm": 0.0, "kl": 157794682535936.0, "learning_rate": 2.5074424465196496e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9058 }, { "completion_length": 680.25, "epoch": 2.510809312638581, "grad_norm": 0.0, "kl": 0.2386801540851593, "learning_rate": 2.5070046567293265e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9059 }, { "completion_length": 577.25, "epoch": 2.511086474501109, "grad_norm": 0.0, "kl": 0.2061990350484848, "learning_rate": 2.5065668667241993e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9060 }, { "completion_length": 707.75, "epoch": 2.5113636363636362, "grad_norm": 0.0, "kl": 0.16145561635494232, "learning_rate": 2.506129076517694e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9061 }, { "completion_length": 753.0, "epoch": 2.511640798226164, "grad_norm": 0.0, "kl": 0.23542311787605286, "learning_rate": 2.5056912861232356e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9062 }, { "completion_length": 563.75, "epoch": 2.5119179600886916, "grad_norm": 0.0, "kl": 0.6783937215805054, "learning_rate": 2.5052534955542483e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9063 }, { "completion_length": 652.75, "epoch": 2.5121951219512195, "grad_norm": 0.0, "kl": 0.30556589365005493, "learning_rate": 2.5048157048241588e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9064 }, { "completion_length": 699.5, "epoch": 2.5124722838137474, "grad_norm": 0.0, "kl": 0.16121187806129456, "learning_rate": 2.5043779139463915e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9065 }, { "completion_length": 721.25, "epoch": 2.512749445676275, "grad_norm": 0.0, "kl": 0.31418493390083313, "learning_rate": 2.5039401229343717e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9066 }, { "completion_length": 669.75, "epoch": 2.513026607538803, "grad_norm": 0.0, "kl": 0.18590988218784332, "learning_rate": 2.503502331801525e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9067 }, { "completion_length": 582.75, "epoch": 2.5133037694013303, "grad_norm": 0.0, "kl": 0.7501149773597717, "learning_rate": 2.503064540561276e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9068 }, { "completion_length": 729.5, "epoch": 2.513580931263858, "grad_norm": 0.0, "kl": 0.15246324241161346, "learning_rate": 2.5026267492270505e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9069 }, { "completion_length": 663.0, "epoch": 2.5138580931263856, "grad_norm": 0.0, "kl": 0.18789012730121613, "learning_rate": 2.5021889578122745e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9070 }, { "completion_length": 683.25, "epoch": 2.5141352549889135, "grad_norm": 0.0, "kl": 0.1606217473745346, "learning_rate": 2.501751166330371e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9071 }, { "completion_length": 747.75, "epoch": 2.5144124168514415, "grad_norm": 0.0, "kl": 0.3482196629047394, "learning_rate": 2.5013133747947665e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9072 }, { "completion_length": 746.5, "epoch": 2.514689578713969, "grad_norm": 0.0, "kl": 4.739926338195801, "learning_rate": 2.5008755832188866e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9073 }, { "completion_length": 700.25, "epoch": 2.514966740576497, "grad_norm": 0.0, "kl": 0.4044709801673889, "learning_rate": 2.500437791616156e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9074 }, { "completion_length": 698.5, "epoch": 2.5152439024390243, "grad_norm": 0.0, "kl": 495272.21875, "learning_rate": 2.5e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9075 }, { "completion_length": 775.25, "epoch": 2.515521064301552, "grad_norm": 0.0, "kl": 46345.0546875, "learning_rate": 2.4995622083838443e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9076 }, { "completion_length": 711.75, "epoch": 2.5157982261640797, "grad_norm": 0.0, "kl": 0.22433163225650787, "learning_rate": 2.4991244167811143e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9077 }, { "completion_length": 562.75, "epoch": 2.5160753880266076, "grad_norm": 0.0, "kl": 0.22238582372665405, "learning_rate": 2.4986866252052344e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9078 }, { "completion_length": 646.0, "epoch": 2.5163525498891355, "grad_norm": 0.0, "kl": 0.20859594643115997, "learning_rate": 2.4982488336696304e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9079 }, { "completion_length": 642.5, "epoch": 2.516629711751663, "grad_norm": 0.0, "kl": 0.1799740046262741, "learning_rate": 2.4978110421877267e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9080 }, { "completion_length": 759.5, "epoch": 2.516906873614191, "grad_norm": 0.0, "kl": 3.3192834854125977, "learning_rate": 2.4973732507729503e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9081 }, { "completion_length": 751.75, "epoch": 2.5171840354767183, "grad_norm": 0.0, "kl": 0.15517687797546387, "learning_rate": 2.4969354594387243e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9082 }, { "completion_length": 804.25, "epoch": 2.5174611973392462, "grad_norm": 0.0, "kl": 0.17085011303424835, "learning_rate": 2.4964976681984752e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9083 }, { "completion_length": 710.0, "epoch": 2.5177383592017737, "grad_norm": 0.0, "kl": 0.24518714845180511, "learning_rate": 2.4960598770656287e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9084 }, { "completion_length": 630.25, "epoch": 2.5180155210643016, "grad_norm": 0.0, "kl": 0.1763681173324585, "learning_rate": 2.495622086053609e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9085 }, { "completion_length": 604.5, "epoch": 2.5182926829268295, "grad_norm": 0.0, "kl": 0.49357670545578003, "learning_rate": 2.495184295175842e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9086 }, { "completion_length": 629.5, "epoch": 2.518569844789357, "grad_norm": 0.0, "kl": 0.19381242990493774, "learning_rate": 2.494746504445752e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9087 }, { "completion_length": 694.75, "epoch": 2.5188470066518844, "grad_norm": 0.0, "kl": 0.16383622586727142, "learning_rate": 2.4943087138767656e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9088 }, { "completion_length": 615.75, "epoch": 2.5191241685144123, "grad_norm": 0.0, "kl": 0.19999603927135468, "learning_rate": 2.493870923482307e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9089 }, { "completion_length": 669.5, "epoch": 2.5194013303769403, "grad_norm": 0.0, "kl": 247.02943420410156, "learning_rate": 2.4934331332758006e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9090 }, { "completion_length": 646.75, "epoch": 2.5196784922394677, "grad_norm": 0.0, "kl": 2889.9150390625, "learning_rate": 2.4929953432706743e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9091 }, { "completion_length": 626.5, "epoch": 2.5199556541019956, "grad_norm": 0.0, "kl": 9.714346885681152, "learning_rate": 2.4925575534803504e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9092 }, { "completion_length": 533.75, "epoch": 2.5202328159645235, "grad_norm": 0.0, "kl": 0.21676428616046906, "learning_rate": 2.4921197639182566e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9093 }, { "completion_length": 705.75, "epoch": 2.520509977827051, "grad_norm": 0.0, "kl": 352324386816.0, "learning_rate": 2.491681974597816e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9094 }, { "completion_length": 703.5, "epoch": 2.5207871396895785, "grad_norm": 0.0, "kl": 0.16236819326877594, "learning_rate": 2.4912441855324556e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9095 }, { "completion_length": 676.0, "epoch": 2.5210643015521064, "grad_norm": 0.0, "kl": 4115102.5, "learning_rate": 2.4908063967355984e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9096 }, { "completion_length": 621.75, "epoch": 2.5213414634146343, "grad_norm": 0.5971347093582153, "kl": 399359.03125, "learning_rate": 2.490368608220672e-06, "loss": 0.0, "reward": 5.5, "reward_std": 0.5, "rewards/confident_score_func": 1.75, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9097 }, { "completion_length": 684.5, "epoch": 2.5216186252771617, "grad_norm": 0.0, "kl": 0.25490543246269226, "learning_rate": 2.489930820001099e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9098 }, { "completion_length": 680.75, "epoch": 2.5218957871396896, "grad_norm": 0.0, "kl": 0.18206137418746948, "learning_rate": 2.489493032090306e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9099 }, { "completion_length": 628.5, "epoch": 2.5221729490022176, "grad_norm": 0.0, "kl": 0.20685705542564392, "learning_rate": 2.4890552445017195e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9100 }, { "completion_length": 798.0, "epoch": 2.522450110864745, "grad_norm": 0.0, "kl": 0.16076666116714478, "learning_rate": 2.488617457248761e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9101 }, { "completion_length": 672.25, "epoch": 2.5227272727272725, "grad_norm": 0.3988569676876068, "kl": 0.2309620976448059, "learning_rate": 2.48817967034486e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9102 }, { "completion_length": 728.0, "epoch": 2.5230044345898004, "grad_norm": 0.5204070210456848, "kl": 65429632.0, "learning_rate": 2.4877418838034373e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9103 }, { "completion_length": 576.5, "epoch": 2.5232815964523283, "grad_norm": 2.960768699645996, "kl": 0.2495303899049759, "learning_rate": 2.4873040976379215e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9104 }, { "completion_length": 695.75, "epoch": 2.5235587583148558, "grad_norm": 1.4002803564071655, "kl": 7638443.5, "learning_rate": 2.4868663118617355e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9105 }, { "completion_length": 705.0, "epoch": 2.5238359201773837, "grad_norm": 0.0, "kl": 0.19497661292552948, "learning_rate": 2.4864285264883047e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9106 }, { "completion_length": 545.5, "epoch": 2.5241130820399116, "grad_norm": 0.0, "kl": 0.37723004817962646, "learning_rate": 2.4859907415310555e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9107 }, { "completion_length": 748.0, "epoch": 2.524390243902439, "grad_norm": 0.36229515075683594, "kl": 0.18671700358390808, "learning_rate": 2.4855529570034113e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9108 }, { "completion_length": 748.75, "epoch": 2.5246674057649665, "grad_norm": 0.0, "kl": 255.72679138183594, "learning_rate": 2.4851151729187984e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9109 }, { "completion_length": 712.5, "epoch": 2.5249445676274944, "grad_norm": 0.0, "kl": 0.1675824224948883, "learning_rate": 2.4846773892906404e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9110 }, { "completion_length": 528.5, "epoch": 2.5252217294900223, "grad_norm": 0.0, "kl": 6590661.0, "learning_rate": 2.4842396061323643e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9111 }, { "completion_length": 628.0, "epoch": 2.52549889135255, "grad_norm": 0.9557952284812927, "kl": 9216413663232.0, "learning_rate": 2.4838018234573925e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9112 }, { "completion_length": 679.75, "epoch": 2.5257760532150777, "grad_norm": 3.227490186691284, "kl": 55706.16015625, "learning_rate": 2.4833640412791517e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9113 }, { "completion_length": 717.5, "epoch": 2.526053215077605, "grad_norm": 0.0, "kl": 0.17396950721740723, "learning_rate": 2.4829262596110674e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9114 }, { "completion_length": 731.25, "epoch": 2.526330376940133, "grad_norm": 0.3679889142513275, "kl": 0.26450836658477783, "learning_rate": 2.482488478466563e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9115 }, { "completion_length": 661.25, "epoch": 2.5266075388026605, "grad_norm": 0.0, "kl": 0.34152576327323914, "learning_rate": 2.4820506978590647e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9116 }, { "completion_length": 611.0, "epoch": 2.5268847006651884, "grad_norm": 0.6424809694290161, "kl": 44.1206169128418, "learning_rate": 2.4816129178019965e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9117 }, { "completion_length": 593.0, "epoch": 2.5271618625277164, "grad_norm": 0.0, "kl": 0.2004433423280716, "learning_rate": 2.4811751383087836e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9118 }, { "completion_length": 658.0, "epoch": 2.527439024390244, "grad_norm": 0.0, "kl": 0.21505209803581238, "learning_rate": 2.480737359392851e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9119 }, { "completion_length": 633.25, "epoch": 2.5277161862527717, "grad_norm": 0.0, "kl": 0.9307634830474854, "learning_rate": 2.4802995810676227e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9120 }, { "completion_length": 718.75, "epoch": 2.527993348115299, "grad_norm": 0.5030681490898132, "kl": 40903460782080.0, "learning_rate": 2.4798618033465256e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9121 }, { "completion_length": 545.75, "epoch": 2.528270509977827, "grad_norm": 0.0, "kl": 0.2513318359851837, "learning_rate": 2.479424026242982e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9122 }, { "completion_length": 633.0, "epoch": 2.5285476718403546, "grad_norm": 0.0, "kl": 0.20685474574565887, "learning_rate": 2.4789862497704187e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9123 }, { "completion_length": 594.0, "epoch": 2.5288248337028825, "grad_norm": 0.0, "kl": 0.2767717242240906, "learning_rate": 2.4785484739422594e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9124 }, { "completion_length": 701.5, "epoch": 2.5291019955654104, "grad_norm": 0.0, "kl": 0.14318853616714478, "learning_rate": 2.4781106987719296e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9125 }, { "completion_length": 614.75, "epoch": 2.529379157427938, "grad_norm": 7.130221843719482, "kl": 0.1788163185119629, "learning_rate": 2.477672924272853e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9126 }, { "completion_length": 626.25, "epoch": 2.5296563192904657, "grad_norm": 0.0, "kl": 0.17360348999500275, "learning_rate": 2.4772351504584557e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9127 }, { "completion_length": 777.0, "epoch": 2.529933481152993, "grad_norm": 0.0, "kl": 0.1720477193593979, "learning_rate": 2.476797377342161e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9128 }, { "completion_length": 698.25, "epoch": 2.530210643015521, "grad_norm": 0.0, "kl": 0.1659737229347229, "learning_rate": 2.476359604937394e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9129 }, { "completion_length": 641.0, "epoch": 2.5304878048780486, "grad_norm": 0.0, "kl": 0.2113347053527832, "learning_rate": 2.4759218332575804e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9130 }, { "completion_length": 737.0, "epoch": 2.5307649667405765, "grad_norm": 0.0, "kl": 9.910520553588867, "learning_rate": 2.4754840623161426e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9131 }, { "completion_length": 690.25, "epoch": 2.5310421286031044, "grad_norm": 0.0, "kl": 0.22116664052009583, "learning_rate": 2.475046292126509e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9132 }, { "completion_length": 575.25, "epoch": 2.531319290465632, "grad_norm": 0.0, "kl": 0.20733056962490082, "learning_rate": 2.4746085227020995e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9133 }, { "completion_length": 540.75, "epoch": 2.5315964523281598, "grad_norm": 0.0, "kl": 0.22006413340568542, "learning_rate": 2.4741707540563425e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9134 }, { "completion_length": 782.75, "epoch": 2.5318736141906872, "grad_norm": 0.0, "kl": 0.5446171164512634, "learning_rate": 2.47373298620266e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9135 }, { "completion_length": 570.25, "epoch": 2.532150776053215, "grad_norm": 0.0, "kl": 0.20077744126319885, "learning_rate": 2.4732952191544774e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9136 }, { "completion_length": 576.75, "epoch": 2.5324279379157426, "grad_norm": 0.0, "kl": 0.20195336639881134, "learning_rate": 2.4728574529252203e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9137 }, { "completion_length": 725.75, "epoch": 2.5327050997782705, "grad_norm": 0.0, "kl": 0.22686967253684998, "learning_rate": 2.4724196875283116e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9138 }, { "completion_length": 813.0, "epoch": 2.5329822616407984, "grad_norm": 0.0, "kl": 36693840.0, "learning_rate": 2.471981922977177e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9139 }, { "completion_length": 656.25, "epoch": 2.533259423503326, "grad_norm": 0.0, "kl": 0.21873606741428375, "learning_rate": 2.4715441592852398e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9140 }, { "completion_length": 663.0, "epoch": 2.533536585365854, "grad_norm": 0.0, "kl": 0.2001948058605194, "learning_rate": 2.471106396465925e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9141 }, { "completion_length": 658.5, "epoch": 2.5338137472283813, "grad_norm": 0.0, "kl": 1.6455568075180054, "learning_rate": 2.470668634532656e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9142 }, { "completion_length": 587.5, "epoch": 2.534090909090909, "grad_norm": 0.0, "kl": 0.19825470447540283, "learning_rate": 2.4702308734988585e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9143 }, { "completion_length": 689.75, "epoch": 2.5343680709534366, "grad_norm": 0.35765525698661804, "kl": 262239764348928.0, "learning_rate": 2.4697931133779566e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9144 }, { "completion_length": 647.75, "epoch": 2.5346452328159645, "grad_norm": 0.0, "kl": 0.1814805567264557, "learning_rate": 2.4693553541833742e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9145 }, { "completion_length": 606.5, "epoch": 2.5349223946784925, "grad_norm": 0.0, "kl": 1.4122267961502075, "learning_rate": 2.468917595928536e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9146 }, { "completion_length": 997.0, "epoch": 2.53519955654102, "grad_norm": 0.23673631250858307, "kl": 0.17209024727344513, "learning_rate": 2.468479838626865e-06, "loss": 0.0, "reward": 4.09375, "reward_std": 3.3125, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.59375, "step": 9147 }, { "completion_length": 627.75, "epoch": 2.535476718403548, "grad_norm": 0.0, "kl": 22118300.0, "learning_rate": 2.468042082291787e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9148 }, { "completion_length": 623.75, "epoch": 2.5357538802660753, "grad_norm": 0.0, "kl": 0.2599058151245117, "learning_rate": 2.4676043269367245e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9149 }, { "completion_length": 651.25, "epoch": 2.536031042128603, "grad_norm": 0.0, "kl": 0.20132705569267273, "learning_rate": 2.4671665725751025e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9150 }, { "completion_length": 601.5, "epoch": 2.5363082039911307, "grad_norm": 0.0, "kl": 0.18420687317848206, "learning_rate": 2.4667288192203464e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9151 }, { "completion_length": 716.5, "epoch": 2.5365853658536586, "grad_norm": 0.0, "kl": 0.34638917446136475, "learning_rate": 2.4662910668858774e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9152 }, { "completion_length": 922.5, "epoch": 2.5368625277161865, "grad_norm": 0.0, "kl": 0.19991713762283325, "learning_rate": 2.465853315585123e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9153 }, { "completion_length": 608.25, "epoch": 2.537139689578714, "grad_norm": 0.0, "kl": 0.2185225784778595, "learning_rate": 2.465415565331504e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9154 }, { "completion_length": 668.0, "epoch": 2.537416851441242, "grad_norm": 0.0, "kl": 0.18322357535362244, "learning_rate": 2.464977816138446e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9155 }, { "completion_length": 624.75, "epoch": 2.5376940133037693, "grad_norm": 0.0, "kl": 0.2026352435350418, "learning_rate": 2.4645400680193724e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9156 }, { "completion_length": 667.5, "epoch": 2.537971175166297, "grad_norm": 0.5307745933532715, "kl": 133548367413248.0, "learning_rate": 2.4641023209877084e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9157 }, { "completion_length": 673.5, "epoch": 2.5382483370288247, "grad_norm": 0.0, "kl": 0.19898292422294617, "learning_rate": 2.463664575056876e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9158 }, { "completion_length": 642.0, "epoch": 2.5385254988913526, "grad_norm": 0.0, "kl": 0.19755896925926208, "learning_rate": 2.4632268302403e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9159 }, { "completion_length": 524.25, "epoch": 2.5388026607538805, "grad_norm": 0.0, "kl": 9600073990144.0, "learning_rate": 2.462789086551405e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9160 }, { "completion_length": 686.5, "epoch": 2.539079822616408, "grad_norm": 0.0, "kl": 0.2020532190799713, "learning_rate": 2.462351344003612e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9161 }, { "completion_length": 515.5, "epoch": 2.5393569844789354, "grad_norm": 0.0, "kl": 0.22789789736270905, "learning_rate": 2.461913602610349e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9162 }, { "completion_length": 716.0, "epoch": 2.5396341463414633, "grad_norm": 0.0, "kl": 0.15240807831287384, "learning_rate": 2.461475862385036e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9163 }, { "completion_length": 645.0, "epoch": 2.5399113082039912, "grad_norm": 0.0, "kl": 0.2319241464138031, "learning_rate": 2.461038123341099e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9164 }, { "completion_length": 653.5, "epoch": 2.5401884700665187, "grad_norm": 0.0, "kl": 0.226224884390831, "learning_rate": 2.4606003854919595e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9165 }, { "completion_length": 633.5, "epoch": 2.5404656319290466, "grad_norm": 0.7451850175857544, "kl": 8663404.0, "learning_rate": 2.4601626488510426e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9166 }, { "completion_length": 562.75, "epoch": 2.5407427937915745, "grad_norm": 0.0, "kl": 0.18674957752227783, "learning_rate": 2.459724913431772e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9167 }, { "completion_length": 675.0, "epoch": 2.541019955654102, "grad_norm": 0.0, "kl": 0.1848025918006897, "learning_rate": 2.4592871792475705e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9168 }, { "completion_length": 653.5, "epoch": 2.5412971175166295, "grad_norm": 0.7223089933395386, "kl": 161780697399296.0, "learning_rate": 2.458849446311862e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9169 }, { "completion_length": 608.0, "epoch": 2.5415742793791574, "grad_norm": 0.0, "kl": 0.1901414543390274, "learning_rate": 2.4584117146380695e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9170 }, { "completion_length": 583.5, "epoch": 2.5418514412416853, "grad_norm": 0.0, "kl": 257330429952.0, "learning_rate": 2.4579739842396177e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9171 }, { "completion_length": 715.0, "epoch": 2.5421286031042127, "grad_norm": 0.0, "kl": 0.19061875343322754, "learning_rate": 2.4575362551299277e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9172 }, { "completion_length": 814.25, "epoch": 2.5424057649667406, "grad_norm": 0.3245374858379364, "kl": 0.15058492124080658, "learning_rate": 2.457098527322424e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9173 }, { "completion_length": 608.5, "epoch": 2.5426829268292686, "grad_norm": 0.0, "kl": 2752188.25, "learning_rate": 2.456660800830531e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9174 }, { "completion_length": 640.75, "epoch": 2.542960088691796, "grad_norm": 6.0941081047058105, "kl": 20868644.0, "learning_rate": 2.45622307566767e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9175 }, { "completion_length": 767.25, "epoch": 2.5432372505543235, "grad_norm": 0.0, "kl": 0.17049653828144073, "learning_rate": 2.455785351847266e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9176 }, { "completion_length": 667.25, "epoch": 2.5435144124168514, "grad_norm": 0.0, "kl": 0.19678211212158203, "learning_rate": 2.4553476293827406e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9177 }, { "completion_length": 653.0, "epoch": 2.5437915742793793, "grad_norm": 0.0, "kl": 1.5049270391464233, "learning_rate": 2.454909908287519e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9178 }, { "completion_length": 691.0, "epoch": 2.5440687361419068, "grad_norm": 0.0, "kl": 0.18360383808612823, "learning_rate": 2.4544721885750217e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9179 }, { "completion_length": 631.75, "epoch": 2.5443458980044347, "grad_norm": 0.0, "kl": 0.22905908524990082, "learning_rate": 2.454034470258673e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9180 }, { "completion_length": 637.75, "epoch": 2.5446230598669626, "grad_norm": 0.0, "kl": 0.20993494987487793, "learning_rate": 2.4535967533518965e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9181 }, { "completion_length": 572.25, "epoch": 2.54490022172949, "grad_norm": 0.0, "kl": 0.22521522641181946, "learning_rate": 2.4531590378681135e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9182 }, { "completion_length": 827.25, "epoch": 2.5451773835920175, "grad_norm": 0.0, "kl": 0.17362307012081146, "learning_rate": 2.4527213238207497e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9183 }, { "completion_length": 765.25, "epoch": 2.5454545454545454, "grad_norm": 0.0, "kl": 0.17607100307941437, "learning_rate": 2.4522836112232243e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9184 }, { "completion_length": 778.25, "epoch": 2.5457317073170733, "grad_norm": 0.0, "kl": 0.19312481582164764, "learning_rate": 2.4518459000889635e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9185 }, { "completion_length": 826.25, "epoch": 2.546008869179601, "grad_norm": 0.0, "kl": 0.1962297558784485, "learning_rate": 2.4514081904313877e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9186 }, { "completion_length": 715.5, "epoch": 2.5462860310421287, "grad_norm": 0.0, "kl": 1.0359851121902466, "learning_rate": 2.450970482263921e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9187 }, { "completion_length": 555.0, "epoch": 2.546563192904656, "grad_norm": 0.0, "kl": 3754991104.0, "learning_rate": 2.4505327755999857e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9188 }, { "completion_length": 642.0, "epoch": 2.546840354767184, "grad_norm": 0.0, "kl": 0.1745622456073761, "learning_rate": 2.450095070453004e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9189 }, { "completion_length": 700.75, "epoch": 2.5471175166297115, "grad_norm": 0.3291291892528534, "kl": 0.18874181807041168, "learning_rate": 2.4496573668364e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9190 }, { "completion_length": 643.0, "epoch": 2.5473946784922394, "grad_norm": 0.0, "kl": 0.2151467204093933, "learning_rate": 2.449219664763594e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9191 }, { "completion_length": 605.0, "epoch": 2.5476718403547673, "grad_norm": 0.0, "kl": 0.31867486238479614, "learning_rate": 2.44878196424801e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9192 }, { "completion_length": 665.75, "epoch": 2.547949002217295, "grad_norm": 0.0, "kl": 2340900.5, "learning_rate": 2.4483442653030695e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9193 }, { "completion_length": 689.5, "epoch": 2.5482261640798227, "grad_norm": 0.0, "kl": 0.19196473062038422, "learning_rate": 2.447906567942197e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9194 }, { "completion_length": 781.5, "epoch": 2.54850332594235, "grad_norm": 0.0, "kl": 0.2237248420715332, "learning_rate": 2.4474688721788116e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9195 }, { "completion_length": 706.0, "epoch": 2.548780487804878, "grad_norm": 0.0, "kl": 0.20679855346679688, "learning_rate": 2.447031178026338e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9196 }, { "completion_length": 602.75, "epoch": 2.5490576496674056, "grad_norm": 0.0, "kl": 111318896.0, "learning_rate": 2.4465934854981985e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9197 }, { "completion_length": 713.5, "epoch": 2.5493348115299335, "grad_norm": 0.0, "kl": 0.19081611931324005, "learning_rate": 2.446155794607814e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9198 }, { "completion_length": 646.75, "epoch": 2.5496119733924614, "grad_norm": 0.0, "kl": 27209852.0, "learning_rate": 2.4457181053686076e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9199 }, { "completion_length": 718.75, "epoch": 2.549889135254989, "grad_norm": 0.0, "kl": 0.25877290964126587, "learning_rate": 2.4452804177940008e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9200 }, { "completion_length": 617.0, "epoch": 2.5501662971175167, "grad_norm": 0.0, "kl": 0.24523672461509705, "learning_rate": 2.4448427318974166e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9201 }, { "completion_length": 771.75, "epoch": 2.550443458980044, "grad_norm": 0.0, "kl": 0.21158410608768463, "learning_rate": 2.444405047692276e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9202 }, { "completion_length": 643.0, "epoch": 2.550720620842572, "grad_norm": 0.0, "kl": 0.17794090509414673, "learning_rate": 2.443967365192e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9203 }, { "completion_length": 743.25, "epoch": 2.5509977827050996, "grad_norm": 3.030231475830078, "kl": 408365957120.0, "learning_rate": 2.443529684410014e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9204 }, { "completion_length": 706.25, "epoch": 2.5512749445676275, "grad_norm": 5.622204780578613, "kl": 3261289725952.0, "learning_rate": 2.443092005359736e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9205 }, { "completion_length": 637.25, "epoch": 2.5515521064301554, "grad_norm": 5.966109752655029, "kl": 10082161.0, "learning_rate": 2.4426543280545903e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9206 }, { "completion_length": 668.25, "epoch": 2.551829268292683, "grad_norm": 0.0, "kl": 0.23277167975902557, "learning_rate": 2.4422166525079974e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9207 }, { "completion_length": 625.5, "epoch": 2.5521064301552108, "grad_norm": 0.0, "kl": 0.1923375427722931, "learning_rate": 2.44177897873338e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9208 }, { "completion_length": 641.75, "epoch": 2.5523835920177382, "grad_norm": 0.0, "kl": 0.27336832880973816, "learning_rate": 2.4413413067441586e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9209 }, { "completion_length": 714.5, "epoch": 2.552660753880266, "grad_norm": 0.0, "kl": 0.2478509247303009, "learning_rate": 2.4409036365537553e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9210 }, { "completion_length": 635.0, "epoch": 2.5529379157427936, "grad_norm": 0.0, "kl": 0.2235402762889862, "learning_rate": 2.440465968175592e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9211 }, { "completion_length": 742.25, "epoch": 2.5532150776053215, "grad_norm": 0.0, "kl": 0.7446072697639465, "learning_rate": 2.4400283016230893e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9212 }, { "completion_length": 598.25, "epoch": 2.5534922394678494, "grad_norm": 0.0, "kl": 0.2149576097726822, "learning_rate": 2.43959063690967e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9213 }, { "completion_length": 683.25, "epoch": 2.553769401330377, "grad_norm": 0.0, "kl": 0.2024037390947342, "learning_rate": 2.4391529740487533e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9214 }, { "completion_length": 776.75, "epoch": 2.554046563192905, "grad_norm": 0.0, "kl": 0.17556191980838776, "learning_rate": 2.4387153130537633e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9215 }, { "completion_length": 559.0, "epoch": 2.5543237250554323, "grad_norm": 0.0, "kl": 5.076413154602051, "learning_rate": 2.438277653938118e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9216 }, { "completion_length": 616.25, "epoch": 2.55460088691796, "grad_norm": 0.0, "kl": 6.107576370239258, "learning_rate": 2.437839996715241e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9217 }, { "completion_length": 665.75, "epoch": 2.5548780487804876, "grad_norm": 0.0, "kl": 0.2150379717350006, "learning_rate": 2.4374023413985532e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9218 }, { "completion_length": 746.0, "epoch": 2.5551552106430155, "grad_norm": 0.0, "kl": 0.2536360025405884, "learning_rate": 2.4369646880014746e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9219 }, { "completion_length": 620.75, "epoch": 2.5554323725055434, "grad_norm": 0.0, "kl": 0.2370428740978241, "learning_rate": 2.4365270365374277e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9220 }, { "completion_length": 738.5, "epoch": 2.555709534368071, "grad_norm": 0.0, "kl": 0.15822555124759674, "learning_rate": 2.4360893870198316e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9221 }, { "completion_length": 677.25, "epoch": 2.555986696230599, "grad_norm": 0.0, "kl": 0.25699347257614136, "learning_rate": 2.4356517394621088e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9222 }, { "completion_length": 681.25, "epoch": 2.5562638580931263, "grad_norm": 0.0, "kl": 0.2038307636976242, "learning_rate": 2.4352140938776792e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9223 }, { "completion_length": 541.25, "epoch": 2.556541019955654, "grad_norm": 0.0, "kl": 0.34730827808380127, "learning_rate": 2.4347764502799637e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9224 }, { "completion_length": 609.0, "epoch": 2.5568181818181817, "grad_norm": 0.0, "kl": 0.24974775314331055, "learning_rate": 2.4343388086823828e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9225 }, { "completion_length": 618.0, "epoch": 2.5570953436807096, "grad_norm": 0.0, "kl": 3134138299187200.0, "learning_rate": 2.433901169098358e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9226 }, { "completion_length": 559.25, "epoch": 2.5573725055432375, "grad_norm": 0.0, "kl": 0.44103121757507324, "learning_rate": 2.4334635315413096e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9227 }, { "completion_length": 619.25, "epoch": 2.557649667405765, "grad_norm": 0.0, "kl": 0.2842353880405426, "learning_rate": 2.433025896024657e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9228 }, { "completion_length": 650.5, "epoch": 2.557926829268293, "grad_norm": 0.0, "kl": 0.2854539453983307, "learning_rate": 2.4325882625618223e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9229 }, { "completion_length": 655.25, "epoch": 2.5582039911308203, "grad_norm": 0.0, "kl": 0.20435193181037903, "learning_rate": 2.432150631166225e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9230 }, { "completion_length": 758.25, "epoch": 2.558481152993348, "grad_norm": 0.0, "kl": 0.15653342008590698, "learning_rate": 2.431713001851286e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9231 }, { "completion_length": 726.75, "epoch": 2.5587583148558757, "grad_norm": 0.33503803610801697, "kl": 1.17481314189312e+16, "learning_rate": 2.4312753746304248e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9232 }, { "completion_length": 626.0, "epoch": 2.5590354767184036, "grad_norm": 0.0, "kl": 0.2246519774198532, "learning_rate": 2.430837749517061e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9233 }, { "completion_length": 649.25, "epoch": 2.5593126385809315, "grad_norm": 0.0, "kl": 0.2268892526626587, "learning_rate": 2.4304001265246178e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9234 }, { "completion_length": 712.5, "epoch": 2.559589800443459, "grad_norm": 0.0, "kl": 0.18433509767055511, "learning_rate": 2.429962505666511e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9235 }, { "completion_length": 675.0, "epoch": 2.5598669623059864, "grad_norm": 0.4254568815231323, "kl": 36766618746880.0, "learning_rate": 2.4295248869561643e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9236 }, { "completion_length": 631.25, "epoch": 2.5601441241685143, "grad_norm": 2.560455322265625, "kl": 15579303936.0, "learning_rate": 2.4290872704069955e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9237 }, { "completion_length": 585.75, "epoch": 2.5604212860310422, "grad_norm": 0.0, "kl": 0.483538419008255, "learning_rate": 2.4286496560324257e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9238 }, { "completion_length": 731.25, "epoch": 2.5606984478935697, "grad_norm": 0.0, "kl": 0.21353311836719513, "learning_rate": 2.428212043845874e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9239 }, { "completion_length": 672.0, "epoch": 2.5609756097560976, "grad_norm": 0.0, "kl": 0.2898395359516144, "learning_rate": 2.4277744338607593e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9240 }, { "completion_length": 592.5, "epoch": 2.5612527716186255, "grad_norm": 0.0, "kl": 0.3492463231086731, "learning_rate": 2.4273368260905034e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9241 }, { "completion_length": 726.25, "epoch": 2.561529933481153, "grad_norm": 0.0, "kl": 0.3000757396221161, "learning_rate": 2.4268992205485243e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9242 }, { "completion_length": 735.25, "epoch": 2.5618070953436805, "grad_norm": 0.0, "kl": 0.1794128715991974, "learning_rate": 2.4264616172482425e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9243 }, { "completion_length": 727.25, "epoch": 2.5620842572062084, "grad_norm": 0.0, "kl": 0.20440857112407684, "learning_rate": 2.4260240162030757e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9244 }, { "completion_length": 602.75, "epoch": 2.5623614190687363, "grad_norm": 0.0, "kl": 0.39819109439849854, "learning_rate": 2.4255864174264467e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9245 }, { "completion_length": 779.75, "epoch": 2.5626385809312637, "grad_norm": 0.0, "kl": 0.37342962622642517, "learning_rate": 2.4251488209317705e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9246 }, { "completion_length": 574.5, "epoch": 2.5629157427937916, "grad_norm": 0.0, "kl": 0.34277060627937317, "learning_rate": 2.4247112267324693e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9247 }, { "completion_length": 719.0, "epoch": 2.5631929046563195, "grad_norm": 0.0, "kl": 0.23192618787288666, "learning_rate": 2.4242736348419623e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9248 }, { "completion_length": 656.75, "epoch": 2.563470066518847, "grad_norm": 0.0, "kl": 0.8156840801239014, "learning_rate": 2.4238360452736672e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9249 }, { "completion_length": 648.0, "epoch": 2.5637472283813745, "grad_norm": 0.0, "kl": 0.30270570516586304, "learning_rate": 2.4233984580410045e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9250 }, { "completion_length": 675.25, "epoch": 2.5640243902439024, "grad_norm": 0.0, "kl": 23.335195541381836, "learning_rate": 2.4229608731573916e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9251 }, { "completion_length": 648.0, "epoch": 2.5643015521064303, "grad_norm": 0.0, "kl": 2.9974308013916016, "learning_rate": 2.4225232906362493e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9252 }, { "completion_length": 664.75, "epoch": 2.5645787139689578, "grad_norm": 0.0, "kl": 4.355504512786865, "learning_rate": 2.4220857104909946e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9253 }, { "completion_length": 687.0, "epoch": 2.5648558758314857, "grad_norm": 0.0, "kl": 0.7796372175216675, "learning_rate": 2.4216481327350475e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9254 }, { "completion_length": 646.25, "epoch": 2.5651330376940136, "grad_norm": 0.0, "kl": 0.1901649683713913, "learning_rate": 2.421210557381825e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9255 }, { "completion_length": 611.0, "epoch": 2.565410199556541, "grad_norm": 0.515617311000824, "kl": 0.21240997314453125, "learning_rate": 2.4207729844447474e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9256 }, { "completion_length": 621.0, "epoch": 2.5656873614190685, "grad_norm": 0.0, "kl": 0.2210962325334549, "learning_rate": 2.420335413937234e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9257 }, { "completion_length": 628.5, "epoch": 2.5659645232815964, "grad_norm": 0.0, "kl": 0.350768119096756, "learning_rate": 2.4198978458727006e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9258 }, { "completion_length": 583.5, "epoch": 2.5662416851441243, "grad_norm": 0.0, "kl": 0.41049516201019287, "learning_rate": 2.4194602802645684e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9259 }, { "completion_length": 599.75, "epoch": 2.566518847006652, "grad_norm": 0.0, "kl": 10.677083969116211, "learning_rate": 2.4190227171262534e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9260 }, { "completion_length": 718.0, "epoch": 2.5667960088691797, "grad_norm": 0.0, "kl": 0.24678578972816467, "learning_rate": 2.4185851564711756e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9261 }, { "completion_length": 759.5, "epoch": 2.567073170731707, "grad_norm": 1.6314700841903687, "kl": 3013.6044921875, "learning_rate": 2.4181475983127515e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9262 }, { "completion_length": 684.25, "epoch": 2.567350332594235, "grad_norm": 0.0, "kl": 0.2053249329328537, "learning_rate": 2.4177100426644e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9263 }, { "completion_length": 729.25, "epoch": 2.5676274944567625, "grad_norm": 0.0, "kl": 0.1902131289243698, "learning_rate": 2.4172724895395403e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9264 }, { "completion_length": 671.25, "epoch": 2.5679046563192904, "grad_norm": 0.0, "kl": 0.21278060972690582, "learning_rate": 2.4168349389515875e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9265 }, { "completion_length": 706.75, "epoch": 2.5681818181818183, "grad_norm": 0.0, "kl": 0.3119522035121918, "learning_rate": 2.4163973909139633e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9266 }, { "completion_length": 714.5, "epoch": 2.568458980044346, "grad_norm": 0.3510656952857971, "kl": 1280249622953984.0, "learning_rate": 2.4159598454400816e-06, "loss": 0.0, "reward": 4.625, "reward_std": 2.25, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9267 }, { "completion_length": 619.0, "epoch": 2.5687361419068737, "grad_norm": 0.0, "kl": 0.2183375209569931, "learning_rate": 2.4155223025433625e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9268 }, { "completion_length": 641.0, "epoch": 2.569013303769401, "grad_norm": 0.0, "kl": 0.33398470282554626, "learning_rate": 2.415084762237223e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9269 }, { "completion_length": 637.25, "epoch": 2.569290465631929, "grad_norm": 0.0, "kl": 12.793486595153809, "learning_rate": 2.4146472245350804e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9270 }, { "completion_length": 636.5, "epoch": 2.5695676274944566, "grad_norm": 0.0, "kl": 0.18951618671417236, "learning_rate": 2.4142096894503526e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9271 }, { "completion_length": 655.5, "epoch": 2.5698447893569845, "grad_norm": NaN, "kl": 113.49471282958984, "learning_rate": 2.4137721569964566e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9272 }, { "completion_length": 690.5, "epoch": 2.5701219512195124, "grad_norm": 0.0, "kl": 0.24708032608032227, "learning_rate": 2.4137721569964566e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9273 }, { "completion_length": 738.0, "epoch": 2.57039911308204, "grad_norm": 4.524354934692383, "kl": 14702763573248.0, "learning_rate": 2.4133346271868103e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9274 }, { "completion_length": 641.75, "epoch": 2.5706762749445677, "grad_norm": 0.4901737570762634, "kl": 64782456.0, "learning_rate": 2.41289710003483e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9275 }, { "completion_length": 718.5, "epoch": 2.570953436807095, "grad_norm": 0.0, "kl": 1001041.8125, "learning_rate": 2.412459575553934e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9276 }, { "completion_length": 792.0, "epoch": 2.571230598669623, "grad_norm": 0.0, "kl": 0.1722627580165863, "learning_rate": 2.412022053757537e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9277 }, { "completion_length": 512.25, "epoch": 2.5715077605321506, "grad_norm": 0.0, "kl": 0.2784384489059448, "learning_rate": 2.4115845346590585e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9278 }, { "completion_length": 758.75, "epoch": 2.5717849223946785, "grad_norm": 0.0, "kl": 0.19822247326374054, "learning_rate": 2.4111470182719147e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9279 }, { "completion_length": 736.0, "epoch": 2.5720620842572064, "grad_norm": 0.0, "kl": 0.24722129106521606, "learning_rate": 2.410709504609522e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9280 }, { "completion_length": 696.75, "epoch": 2.572339246119734, "grad_norm": 0.0, "kl": 0.31092679500579834, "learning_rate": 2.410271993685298e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9281 }, { "completion_length": 614.5, "epoch": 2.5726164079822618, "grad_norm": 0.0, "kl": 0.21091295778751373, "learning_rate": 2.409834485512658e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9282 }, { "completion_length": 662.5, "epoch": 2.5728935698447892, "grad_norm": 0.0, "kl": 0.267178475856781, "learning_rate": 2.40939698010502e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9283 }, { "completion_length": 733.5, "epoch": 2.573170731707317, "grad_norm": 0.0, "kl": 0.3049662411212921, "learning_rate": 2.408959477475799e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9284 }, { "completion_length": 754.75, "epoch": 2.5734478935698446, "grad_norm": 0.0, "kl": 0.2551538348197937, "learning_rate": 2.4085219776384116e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9285 }, { "completion_length": 675.5, "epoch": 2.5737250554323725, "grad_norm": 0.0, "kl": 0.25083059072494507, "learning_rate": 2.4080844806062764e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9286 }, { "completion_length": 642.25, "epoch": 2.5740022172949004, "grad_norm": 0.0, "kl": 0.22201231122016907, "learning_rate": 2.407646986392806e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9287 }, { "completion_length": 592.25, "epoch": 2.574279379157428, "grad_norm": 0.0, "kl": 0.2659514546394348, "learning_rate": 2.4072094950114195e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9288 }, { "completion_length": 744.75, "epoch": 2.574556541019956, "grad_norm": 0.0, "kl": 0.21274985373020172, "learning_rate": 2.4067720064755314e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9289 }, { "completion_length": 653.75, "epoch": 2.5748337028824833, "grad_norm": 0.5315492749214172, "kl": 0.40295934677124023, "learning_rate": 2.406334520798558e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9290 }, { "completion_length": 678.25, "epoch": 2.575110864745011, "grad_norm": 0.0, "kl": 0.8390595316886902, "learning_rate": 2.405897037993915e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9291 }, { "completion_length": 691.25, "epoch": 2.5753880266075386, "grad_norm": 0.0, "kl": 1965754454900736.0, "learning_rate": 2.4054595580750193e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9292 }, { "completion_length": 699.5, "epoch": 2.5756651884700665, "grad_norm": 0.0, "kl": 1.4924559593200684, "learning_rate": 2.4050220810552848e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9293 }, { "completion_length": 677.25, "epoch": 2.5759423503325944, "grad_norm": 0.0, "kl": 0.30175545811653137, "learning_rate": 2.404584606948128e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9294 }, { "completion_length": 677.25, "epoch": 2.576219512195122, "grad_norm": 0.0, "kl": 0.19065526127815247, "learning_rate": 2.404147135766965e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9295 }, { "completion_length": 1028.0, "epoch": 2.57649667405765, "grad_norm": 0.0, "kl": 0.21369008719921112, "learning_rate": 2.403709667525209e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9296 }, { "completion_length": 649.25, "epoch": 2.5767738359201773, "grad_norm": 0.0, "kl": 0.19309017062187195, "learning_rate": 2.403272202236279e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9297 }, { "completion_length": 605.5, "epoch": 2.577050997782705, "grad_norm": 0.0, "kl": 0.2574910819530487, "learning_rate": 2.402834739913587e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9298 }, { "completion_length": 627.75, "epoch": 2.5773281596452327, "grad_norm": 0.0, "kl": 1.244051218032837, "learning_rate": 2.4023972805705503e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9299 }, { "completion_length": 699.25, "epoch": 2.5776053215077606, "grad_norm": 0.0, "kl": 660840.625, "learning_rate": 2.4019598242205818e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9300 }, { "completion_length": 713.75, "epoch": 2.5778824833702885, "grad_norm": 0.0, "kl": 3770729224994816.0, "learning_rate": 2.401522370877098e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9301 }, { "completion_length": 684.5, "epoch": 2.578159645232816, "grad_norm": 2.4446005821228027, "kl": 551029.9375, "learning_rate": 2.4010849205535137e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9302 }, { "completion_length": 721.0, "epoch": 2.578436807095344, "grad_norm": 0.0, "kl": 19955802112.0, "learning_rate": 2.400647473263243e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9303 }, { "completion_length": 617.25, "epoch": 2.5787139689578713, "grad_norm": 0.0, "kl": 0.6613286137580872, "learning_rate": 2.4002100290197017e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9304 }, { "completion_length": 676.75, "epoch": 2.578991130820399, "grad_norm": 0.0, "kl": 0.21684204041957855, "learning_rate": 2.3997725878363032e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9305 }, { "completion_length": 747.0, "epoch": 2.5792682926829267, "grad_norm": 0.3808949887752533, "kl": 390918259605504.0, "learning_rate": 2.399335149726463e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9306 }, { "completion_length": 633.5, "epoch": 2.5795454545454546, "grad_norm": 3.7196202278137207, "kl": 497592395890688.0, "learning_rate": 2.398897714703594e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9307 }, { "completion_length": 589.75, "epoch": 2.5798226164079825, "grad_norm": 0.0, "kl": 7758.93603515625, "learning_rate": 2.398460282781112e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9308 }, { "completion_length": 1147.75, "epoch": 2.58009977827051, "grad_norm": 1.3857662677764893, "kl": 172731907702784.0, "learning_rate": 2.398022853972431e-06, "loss": 0.0, "reward": 4.09375, "reward_std": 3.3125, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.59375, "step": 9309 }, { "completion_length": 641.5, "epoch": 2.5803769401330374, "grad_norm": 0.0, "kl": 0.22290325164794922, "learning_rate": 2.3975854282909645e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9310 }, { "completion_length": 723.0, "epoch": 2.5806541019955653, "grad_norm": 0.0, "kl": 0.2921786904335022, "learning_rate": 2.397148005750127e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9311 }, { "completion_length": 677.75, "epoch": 2.5809312638580932, "grad_norm": 0.0, "kl": 0.24554330110549927, "learning_rate": 2.3967105863633326e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9312 }, { "completion_length": 644.75, "epoch": 2.5812084257206207, "grad_norm": 0.0, "kl": 0.2924548089504242, "learning_rate": 2.396273170143995e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9313 }, { "completion_length": 611.25, "epoch": 2.5814855875831486, "grad_norm": 0.0, "kl": 0.2555459439754486, "learning_rate": 2.395835757105527e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9314 }, { "completion_length": 640.0, "epoch": 2.5817627494456765, "grad_norm": 0.0, "kl": 0.3320360481739044, "learning_rate": 2.3953983472613427e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9315 }, { "completion_length": 629.0, "epoch": 2.582039911308204, "grad_norm": 0.0, "kl": 0.3087846636772156, "learning_rate": 2.3949609406248576e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9316 }, { "completion_length": 617.25, "epoch": 2.5823170731707314, "grad_norm": 0.0, "kl": 0.2207096815109253, "learning_rate": 2.394523537209482e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9317 }, { "completion_length": 717.75, "epoch": 2.5825942350332594, "grad_norm": 0.0, "kl": 0.32881543040275574, "learning_rate": 2.3940861370286323e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9318 }, { "completion_length": 785.5, "epoch": 2.5828713968957873, "grad_norm": 0.39209258556365967, "kl": 396875245027328.0, "learning_rate": 2.3936487400957186e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9319 }, { "completion_length": 643.25, "epoch": 2.5831485587583147, "grad_norm": 0.4594663977622986, "kl": 1836650556555264.0, "learning_rate": 2.3932113464241565e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9320 }, { "completion_length": 619.25, "epoch": 2.5834257206208426, "grad_norm": 0.0, "kl": 0.2433934509754181, "learning_rate": 2.3927739560273576e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9321 }, { "completion_length": 763.75, "epoch": 2.5837028824833705, "grad_norm": 0.0, "kl": 0.2612975537776947, "learning_rate": 2.392336568918736e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9322 }, { "completion_length": 651.25, "epoch": 2.583980044345898, "grad_norm": 0.0, "kl": 0.24079294502735138, "learning_rate": 2.3918991851117036e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9323 }, { "completion_length": 711.0, "epoch": 2.5842572062084255, "grad_norm": 0.0, "kl": 0.20765180885791779, "learning_rate": 2.3914618046196735e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9324 }, { "completion_length": 617.0, "epoch": 2.5845343680709534, "grad_norm": 0.0, "kl": 0.26022571325302124, "learning_rate": 2.391024427456059e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9325 }, { "completion_length": 683.75, "epoch": 2.5848115299334813, "grad_norm": 0.0, "kl": 0.17565421760082245, "learning_rate": 2.3905870536342705e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9326 }, { "completion_length": 741.5, "epoch": 2.5850886917960088, "grad_norm": 0.0, "kl": 0.19952233135700226, "learning_rate": 2.3901496831677237e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9327 }, { "completion_length": 740.0, "epoch": 2.5853658536585367, "grad_norm": 0.0, "kl": 0.29637640714645386, "learning_rate": 2.3897123160698277e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9328 }, { "completion_length": 675.5, "epoch": 2.5856430155210646, "grad_norm": 0.0, "kl": 0.22428150475025177, "learning_rate": 2.389274952353998e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9329 }, { "completion_length": 724.5, "epoch": 2.585920177383592, "grad_norm": 0.0, "kl": 1164586422108160.0, "learning_rate": 2.3888375920336428e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9330 }, { "completion_length": 690.25, "epoch": 2.5861973392461195, "grad_norm": 0.0, "kl": 0.1824522167444229, "learning_rate": 2.3884002351221764e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9331 }, { "completion_length": 632.25, "epoch": 2.5864745011086474, "grad_norm": 0.0, "kl": 0.1896010786294937, "learning_rate": 2.387962881633012e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9332 }, { "completion_length": 1007.75, "epoch": 2.5867516629711753, "grad_norm": 0.24675586819648743, "kl": 0.17418310046195984, "learning_rate": 2.387525531579559e-06, "loss": 0.0, "reward": 4.09375, "reward_std": 3.3125, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.59375, "step": 9333 }, { "completion_length": 767.5, "epoch": 2.587028824833703, "grad_norm": 0.0, "kl": 0.33033517003059387, "learning_rate": 2.3870881849752304e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9334 }, { "completion_length": 681.0, "epoch": 2.5873059866962307, "grad_norm": 0.0, "kl": 0.2067069411277771, "learning_rate": 2.386650841833437e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9335 }, { "completion_length": 606.25, "epoch": 2.587583148558758, "grad_norm": 0.0, "kl": 0.22432616353034973, "learning_rate": 2.3862135021675916e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9336 }, { "completion_length": 692.25, "epoch": 2.587860310421286, "grad_norm": 0.0, "kl": 0.1952669471502304, "learning_rate": 2.3857761659911043e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9337 }, { "completion_length": 641.0, "epoch": 2.5881374722838135, "grad_norm": 0.0, "kl": 0.32252997159957886, "learning_rate": 2.385338833317386e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9338 }, { "completion_length": 718.5, "epoch": 2.5884146341463414, "grad_norm": 0.0, "kl": 0.2321963757276535, "learning_rate": 2.3849015041598497e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9339 }, { "completion_length": 681.25, "epoch": 2.5886917960088693, "grad_norm": 0.0, "kl": 0.8384418487548828, "learning_rate": 2.3844641785319054e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9340 }, { "completion_length": 555.75, "epoch": 2.588968957871397, "grad_norm": 0.41014009714126587, "kl": 5469589848719360.0, "learning_rate": 2.3840268564469644e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9341 }, { "completion_length": 714.0, "epoch": 2.5892461197339247, "grad_norm": 0.0, "kl": 0.20217670500278473, "learning_rate": 2.3835895379184365e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9342 }, { "completion_length": 852.25, "epoch": 2.589523281596452, "grad_norm": 0.0, "kl": 0.15414360165596008, "learning_rate": 2.383152222959734e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9343 }, { "completion_length": 700.75, "epoch": 2.58980044345898, "grad_norm": 0.0, "kl": 14.676596641540527, "learning_rate": 2.382714911584266e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9344 }, { "completion_length": 710.25, "epoch": 2.5900776053215075, "grad_norm": 0.3585191071033478, "kl": 0.21735075116157532, "learning_rate": 2.382277603805444e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9345 }, { "completion_length": 652.0, "epoch": 2.5903547671840355, "grad_norm": 0.0, "kl": 439824864.0, "learning_rate": 2.3818402996366786e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9346 }, { "completion_length": 807.75, "epoch": 2.5906319290465634, "grad_norm": 0.3517661690711975, "kl": 0.1923995316028595, "learning_rate": 2.3814029990913785e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9347 }, { "completion_length": 740.25, "epoch": 2.590909090909091, "grad_norm": 0.0, "kl": 11539338.0, "learning_rate": 2.380965702182957e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9348 }, { "completion_length": 535.25, "epoch": 2.5911862527716187, "grad_norm": 0.0, "kl": 0.2644067704677582, "learning_rate": 2.3805284089248203e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9349 }, { "completion_length": 781.25, "epoch": 2.591463414634146, "grad_norm": 0.0, "kl": 193956075798528.0, "learning_rate": 2.380091119330382e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9350 }, { "completion_length": 671.0, "epoch": 2.591740576496674, "grad_norm": 0.0, "kl": 0.21294935047626495, "learning_rate": 2.379653833413049e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9351 }, { "completion_length": 739.25, "epoch": 2.5920177383592016, "grad_norm": 0.0, "kl": 0.1882258504629135, "learning_rate": 2.379216551186233e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9352 }, { "completion_length": 807.25, "epoch": 2.5922949002217295, "grad_norm": 0.0, "kl": 0.16791768372058868, "learning_rate": 2.3787792726633426e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9353 }, { "completion_length": 644.75, "epoch": 2.5925720620842574, "grad_norm": 0.0, "kl": 0.1724826842546463, "learning_rate": 2.3783419978577872e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9354 }, { "completion_length": 643.5, "epoch": 2.592849223946785, "grad_norm": 0.0, "kl": 0.20815350115299225, "learning_rate": 2.3779047267829775e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9355 }, { "completion_length": 737.75, "epoch": 2.5931263858093128, "grad_norm": 3.0071449279785156, "kl": 0.18746599555015564, "learning_rate": 2.3774674594523214e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9356 }, { "completion_length": 742.5, "epoch": 2.5934035476718402, "grad_norm": 0.0, "kl": 1342175.625, "learning_rate": 2.3770301958792293e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9357 }, { "completion_length": 655.75, "epoch": 2.593680709534368, "grad_norm": 0.0, "kl": 6778273363132416.0, "learning_rate": 2.376592936077108e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9358 }, { "completion_length": 730.5, "epoch": 2.5939578713968956, "grad_norm": 0.0, "kl": 0.19194549322128296, "learning_rate": 2.37615568005937e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9359 }, { "completion_length": 650.5, "epoch": 2.5942350332594235, "grad_norm": 0.0, "kl": 235773974544384.0, "learning_rate": 2.3757184278394207e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9360 }, { "completion_length": 675.75, "epoch": 2.5945121951219514, "grad_norm": 0.0, "kl": 0.21794652938842773, "learning_rate": 2.37528117943067e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9361 }, { "completion_length": 772.25, "epoch": 2.594789356984479, "grad_norm": 0.0, "kl": 0.6019402146339417, "learning_rate": 2.374843934846528e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9362 }, { "completion_length": 558.0, "epoch": 2.595066518847007, "grad_norm": 0.0, "kl": 0.5998426079750061, "learning_rate": 2.374406694100401e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9363 }, { "completion_length": 762.5, "epoch": 2.5953436807095343, "grad_norm": 0.0, "kl": 0.2077951729297638, "learning_rate": 2.3739694572056987e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9364 }, { "completion_length": 618.0, "epoch": 2.595620842572062, "grad_norm": 0.0, "kl": 0.20459285378456116, "learning_rate": 2.3735322241758286e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9365 }, { "completion_length": 660.25, "epoch": 2.5958980044345896, "grad_norm": 0.0, "kl": 3245576594391040.0, "learning_rate": 2.3730949950241992e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9366 }, { "completion_length": 629.5, "epoch": 2.5961751662971175, "grad_norm": 0.0, "kl": 0.28545236587524414, "learning_rate": 2.3726577697642183e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9367 }, { "completion_length": 690.75, "epoch": 2.5964523281596454, "grad_norm": 0.0, "kl": 0.23296336829662323, "learning_rate": 2.3722205484092934e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9368 }, { "completion_length": 667.25, "epoch": 2.596729490022173, "grad_norm": 0.0, "kl": 0.22022271156311035, "learning_rate": 2.371783330972834e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9369 }, { "completion_length": 559.5, "epoch": 2.597006651884701, "grad_norm": 0.0, "kl": 0.2304043024778366, "learning_rate": 2.371346117468245e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9370 }, { "completion_length": 612.25, "epoch": 2.5972838137472283, "grad_norm": 0.0, "kl": 0.203156515955925, "learning_rate": 2.3709089079089365e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9371 }, { "completion_length": 773.25, "epoch": 2.597560975609756, "grad_norm": 0.0, "kl": 0.21095512807369232, "learning_rate": 2.370471702308314e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9372 }, { "completion_length": 657.75, "epoch": 2.5978381374722836, "grad_norm": 0.0, "kl": 0.18893443048000336, "learning_rate": 2.3700345006797863e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9373 }, { "completion_length": 596.75, "epoch": 2.5981152993348116, "grad_norm": 0.0, "kl": 0.2620087265968323, "learning_rate": 2.3695973030367593e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9374 }, { "completion_length": 799.5, "epoch": 2.5983924611973395, "grad_norm": 0.0, "kl": 0.17881333827972412, "learning_rate": 2.3691601093926406e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9375 }, { "completion_length": 606.5, "epoch": 2.598669623059867, "grad_norm": 0.0, "kl": 0.19789016246795654, "learning_rate": 2.3687229197608373e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9376 }, { "completion_length": 742.5, "epoch": 2.598946784922395, "grad_norm": 0.0, "kl": 0.19099611043930054, "learning_rate": 2.368285734154755e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9377 }, { "completion_length": 601.5, "epoch": 2.5992239467849223, "grad_norm": 0.0, "kl": 0.16905860602855682, "learning_rate": 2.3678485525878035e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9378 }, { "completion_length": 757.5, "epoch": 2.59950110864745, "grad_norm": 0.0, "kl": 0.22026923298835754, "learning_rate": 2.3674113750733847e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9379 }, { "completion_length": 697.25, "epoch": 2.5997782705099777, "grad_norm": 0.0, "kl": 0.19736254215240479, "learning_rate": 2.36697420162491e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9380 }, { "completion_length": 562.75, "epoch": 2.6000554323725056, "grad_norm": 0.0, "kl": 0.21882081031799316, "learning_rate": 2.366537032255781e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9381 }, { "completion_length": 820.25, "epoch": 2.6003325942350335, "grad_norm": 0.0, "kl": 0.14774149656295776, "learning_rate": 2.3660998669794062e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9382 }, { "completion_length": 496.0, "epoch": 2.600609756097561, "grad_norm": 0.0, "kl": 0.27329903841018677, "learning_rate": 2.3656627058091924e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9383 }, { "completion_length": 720.0, "epoch": 2.6008869179600884, "grad_norm": 0.0, "kl": 0.407543808221817, "learning_rate": 2.365225548758544e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9384 }, { "completion_length": 712.0, "epoch": 2.6011640798226163, "grad_norm": 0.0, "kl": 0.16741743683815002, "learning_rate": 2.3647883958408684e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9385 }, { "completion_length": 702.0, "epoch": 2.6014412416851442, "grad_norm": 0.0, "kl": 0.4853592813014984, "learning_rate": 2.364351247069569e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9386 }, { "completion_length": 642.75, "epoch": 2.6017184035476717, "grad_norm": 0.0, "kl": 0.23897196352481842, "learning_rate": 2.3639141024580534e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9387 }, { "completion_length": 642.5, "epoch": 2.6019955654101996, "grad_norm": 0.5255652070045471, "kl": 88180360.0, "learning_rate": 2.3634769620197253e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9388 }, { "completion_length": 808.25, "epoch": 2.6022727272727275, "grad_norm": 0.0, "kl": 0.18453559279441833, "learning_rate": 2.3630398257679917e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9389 }, { "completion_length": 667.0, "epoch": 2.602549889135255, "grad_norm": 0.0, "kl": 0.5385985374450684, "learning_rate": 2.362602693716256e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9390 }, { "completion_length": 716.0, "epoch": 2.6028270509977824, "grad_norm": 0.0, "kl": 815452.0, "learning_rate": 2.3621655658779246e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9391 }, { "completion_length": 746.5, "epoch": 2.6031042128603104, "grad_norm": 0.0, "kl": 0.533202588558197, "learning_rate": 2.3617284422664023e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9392 }, { "completion_length": 760.5, "epoch": 2.6033813747228383, "grad_norm": 1.5186998844146729, "kl": 43.951725006103516, "learning_rate": 2.361291322895093e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9393 }, { "completion_length": 615.5, "epoch": 2.6036585365853657, "grad_norm": 0.0, "kl": 0.2285853922367096, "learning_rate": 2.3608542077774023e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9394 }, { "completion_length": 582.25, "epoch": 2.6039356984478936, "grad_norm": 0.0, "kl": 0.32254818081855774, "learning_rate": 2.3604170969267333e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9395 }, { "completion_length": 580.25, "epoch": 2.6042128603104215, "grad_norm": 0.0, "kl": 0.21011921763420105, "learning_rate": 2.3599799903564923e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9396 }, { "completion_length": 626.25, "epoch": 2.604490022172949, "grad_norm": 0.3830587863922119, "kl": 13598556.0, "learning_rate": 2.359542888080082e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9397 }, { "completion_length": 673.5, "epoch": 2.6047671840354765, "grad_norm": 0.0, "kl": 60647.2109375, "learning_rate": 2.3591057901109063e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9398 }, { "completion_length": 616.5, "epoch": 2.6050443458980044, "grad_norm": 0.0, "kl": 0.24570591747760773, "learning_rate": 2.358668696462372e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9399 }, { "completion_length": 684.25, "epoch": 2.6053215077605323, "grad_norm": 0.0, "kl": 0.17172367870807648, "learning_rate": 2.3582316071478785e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9400 }, { "completion_length": 684.0, "epoch": 2.6055986696230597, "grad_norm": 0.0, "kl": 0.16260690987110138, "learning_rate": 2.3577945221808334e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9401 }, { "completion_length": 836.75, "epoch": 2.6058758314855877, "grad_norm": 0.0, "kl": 2064849148313600.0, "learning_rate": 2.357357441574638e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9402 }, { "completion_length": 666.25, "epoch": 2.6061529933481156, "grad_norm": 0.0, "kl": 0.16992229223251343, "learning_rate": 2.356920365342697e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9403 }, { "completion_length": 640.5, "epoch": 2.606430155210643, "grad_norm": 0.0, "kl": 0.2492520660161972, "learning_rate": 2.356483293498413e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9404 }, { "completion_length": 703.0, "epoch": 2.6067073170731705, "grad_norm": 0.0, "kl": 0.20081037282943726, "learning_rate": 2.356046226055189e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9405 }, { "completion_length": 580.5, "epoch": 2.6069844789356984, "grad_norm": 0.0, "kl": 0.33180537819862366, "learning_rate": 2.3556091630264294e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9406 }, { "completion_length": 755.75, "epoch": 2.6072616407982263, "grad_norm": 0.0, "kl": 0.362468957901001, "learning_rate": 2.3551721044255353e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9407 }, { "completion_length": 624.5, "epoch": 2.6075388026607538, "grad_norm": 0.0, "kl": 0.2359585464000702, "learning_rate": 2.3547350502659107e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9408 }, { "completion_length": 642.0, "epoch": 2.6078159645232817, "grad_norm": 1.8374284505844116, "kl": 7586533058019328.0, "learning_rate": 2.354298000560957e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9409 }, { "completion_length": 625.5, "epoch": 2.608093126385809, "grad_norm": 0.0, "kl": 0.22376511991024017, "learning_rate": 2.353860955324079e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9410 }, { "completion_length": 732.5, "epoch": 2.608370288248337, "grad_norm": 0.0, "kl": 0.20442399382591248, "learning_rate": 2.353423914568676e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9411 }, { "completion_length": 643.75, "epoch": 2.6086474501108645, "grad_norm": 0.0, "kl": 0.410661906003952, "learning_rate": 2.352986878308152e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9412 }, { "completion_length": 650.0, "epoch": 2.6089246119733924, "grad_norm": 0.0, "kl": 0.23876953125, "learning_rate": 2.3525498465559104e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9413 }, { "completion_length": 703.25, "epoch": 2.6092017738359203, "grad_norm": 0.0, "kl": 0.5828090906143188, "learning_rate": 2.35211281932535e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9414 }, { "completion_length": 819.75, "epoch": 2.609478935698448, "grad_norm": 0.0, "kl": 0.1430436670780182, "learning_rate": 2.3516757966298755e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9415 }, { "completion_length": 748.0, "epoch": 2.6097560975609757, "grad_norm": 0.0, "kl": 0.7145260572433472, "learning_rate": 2.351238778482886e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9416 }, { "completion_length": 678.0, "epoch": 2.610033259423503, "grad_norm": 0.0, "kl": 0.188397616147995, "learning_rate": 2.3508017648977855e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9417 }, { "completion_length": 762.5, "epoch": 2.610310421286031, "grad_norm": 0.0, "kl": 0.250912070274353, "learning_rate": 2.3503647558879736e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9418 }, { "completion_length": 648.25, "epoch": 2.6105875831485585, "grad_norm": 0.0, "kl": 0.24623975157737732, "learning_rate": 2.349927751466853e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9419 }, { "completion_length": 700.25, "epoch": 2.6108647450110865, "grad_norm": 0.0, "kl": 0.28993192315101624, "learning_rate": 2.349490751647823e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9420 }, { "completion_length": 801.5, "epoch": 2.6111419068736144, "grad_norm": 0.0, "kl": 0.29686641693115234, "learning_rate": 2.349053756444285e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9421 }, { "completion_length": 664.0, "epoch": 2.611419068736142, "grad_norm": 0.0, "kl": 0.16723215579986572, "learning_rate": 2.348616765869641e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9422 }, { "completion_length": 694.0, "epoch": 2.6116962305986697, "grad_norm": 0.0, "kl": 0.18992014229297638, "learning_rate": 2.3481797799372913e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9423 }, { "completion_length": 602.5, "epoch": 2.611973392461197, "grad_norm": 0.0, "kl": 0.23493441939353943, "learning_rate": 2.3477427986606366e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9424 }, { "completion_length": 737.0, "epoch": 2.612250554323725, "grad_norm": 0.0, "kl": 0.185634046792984, "learning_rate": 2.347305822053076e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9425 }, { "completion_length": 751.75, "epoch": 2.6125277161862526, "grad_norm": 0.0, "kl": 0.22942687571048737, "learning_rate": 2.3468688501280115e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9426 }, { "completion_length": 665.25, "epoch": 2.6128048780487805, "grad_norm": 0.0, "kl": 0.20557595789432526, "learning_rate": 2.3464318828988416e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9427 }, { "completion_length": 697.75, "epoch": 2.6130820399113084, "grad_norm": 0.0, "kl": 0.23291955888271332, "learning_rate": 2.3459949203789666e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9428 }, { "completion_length": 736.25, "epoch": 2.613359201773836, "grad_norm": 0.0, "kl": 0.25554540753364563, "learning_rate": 2.3455579625817886e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9429 }, { "completion_length": 691.0, "epoch": 2.6136363636363638, "grad_norm": 0.0, "kl": 0.15851737558841705, "learning_rate": 2.3451210095207034e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9430 }, { "completion_length": 614.0, "epoch": 2.613913525498891, "grad_norm": 0.0, "kl": 0.1861128956079483, "learning_rate": 2.3446840612091145e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9431 }, { "completion_length": 596.75, "epoch": 2.614190687361419, "grad_norm": 0.0, "kl": 0.2598337233066559, "learning_rate": 2.3442471176604177e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9432 }, { "completion_length": 747.25, "epoch": 2.6144678492239466, "grad_norm": 0.0, "kl": 0.18349182605743408, "learning_rate": 2.3438101788880147e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9433 }, { "completion_length": 665.25, "epoch": 2.6147450110864745, "grad_norm": 0.0, "kl": 0.23074188828468323, "learning_rate": 2.3433732449053035e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9434 }, { "completion_length": 673.0, "epoch": 2.6150221729490024, "grad_norm": 0.0, "kl": 0.18670892715454102, "learning_rate": 2.342936315725683e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9435 }, { "completion_length": 568.5, "epoch": 2.61529933481153, "grad_norm": 0.0, "kl": 1.2208174467086792, "learning_rate": 2.3424993913625534e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9436 }, { "completion_length": 763.75, "epoch": 2.615576496674058, "grad_norm": 0.0, "kl": 0.8409369587898254, "learning_rate": 2.3420624718293112e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9437 }, { "completion_length": 749.75, "epoch": 2.6158536585365852, "grad_norm": 0.0, "kl": 0.20411323010921478, "learning_rate": 2.341625557139357e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9438 }, { "completion_length": 720.5, "epoch": 2.616130820399113, "grad_norm": 0.418145090341568, "kl": 19337886.0, "learning_rate": 2.341188647306087e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9439 }, { "completion_length": 726.0, "epoch": 2.6164079822616406, "grad_norm": 0.0, "kl": 0.16417434811592102, "learning_rate": 2.3407517423429016e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9440 }, { "completion_length": 604.75, "epoch": 2.6166851441241685, "grad_norm": 0.0, "kl": 0.21274201571941376, "learning_rate": 2.340314842263196e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9441 }, { "completion_length": 703.0, "epoch": 2.6169623059866964, "grad_norm": 3.4957945346832275, "kl": 3566271987712.0, "learning_rate": 2.3398779470803715e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9442 }, { "completion_length": 772.0, "epoch": 2.617239467849224, "grad_norm": 0.0, "kl": 0.15888553857803345, "learning_rate": 2.3394410568078245e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9443 }, { "completion_length": 514.0, "epoch": 2.617516629711752, "grad_norm": 0.0, "kl": 0.22691643238067627, "learning_rate": 2.3390041714589516e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9444 }, { "completion_length": 682.0, "epoch": 2.6177937915742793, "grad_norm": 0.0, "kl": 0.2013990581035614, "learning_rate": 2.3385672910471518e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9445 }, { "completion_length": 740.5, "epoch": 2.618070953436807, "grad_norm": 0.0, "kl": 0.1597861796617508, "learning_rate": 2.3381304155858207e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9446 }, { "completion_length": 590.5, "epoch": 2.6183481152993346, "grad_norm": 0.0, "kl": 0.2062702775001526, "learning_rate": 2.337693545088357e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9447 }, { "completion_length": 745.25, "epoch": 2.6186252771618626, "grad_norm": 0.44896191358566284, "kl": 7091615.5, "learning_rate": 2.337256679568157e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9448 }, { "completion_length": 699.25, "epoch": 2.6189024390243905, "grad_norm": 0.0, "kl": 0.18208609521389008, "learning_rate": 2.3368198190386177e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9449 }, { "completion_length": 645.5, "epoch": 2.619179600886918, "grad_norm": 0.0, "kl": 0.2765725553035736, "learning_rate": 2.3363829635131354e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9450 }, { "completion_length": 727.25, "epoch": 2.619456762749446, "grad_norm": 0.0, "kl": 0.20097850263118744, "learning_rate": 2.3359461130051066e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9451 }, { "completion_length": 621.5, "epoch": 2.6197339246119733, "grad_norm": 0.0, "kl": 0.6201403141021729, "learning_rate": 2.335509267527929e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9452 }, { "completion_length": 647.75, "epoch": 2.620011086474501, "grad_norm": 0.0, "kl": 353825984.0, "learning_rate": 2.335072427094997e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9453 }, { "completion_length": 804.25, "epoch": 2.6202882483370287, "grad_norm": 0.0, "kl": 0.17918622493743896, "learning_rate": 2.3346355917197087e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9454 }, { "completion_length": 610.75, "epoch": 2.6205654101995566, "grad_norm": 0.0, "kl": 0.5473978519439697, "learning_rate": 2.3341987614154583e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9455 }, { "completion_length": 780.75, "epoch": 2.6208425720620845, "grad_norm": 0.0, "kl": 0.1999751329421997, "learning_rate": 2.333761936195642e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9456 }, { "completion_length": 655.25, "epoch": 2.621119733924612, "grad_norm": 0.36379173398017883, "kl": 7329866886152192.0, "learning_rate": 2.3333251160736557e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9457 }, { "completion_length": 844.0, "epoch": 2.6213968957871394, "grad_norm": 0.8987220525741577, "kl": 760976.4375, "learning_rate": 2.332888301062895e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9458 }, { "completion_length": 621.5, "epoch": 2.6216740576496673, "grad_norm": 0.0, "kl": 0.19773787260055542, "learning_rate": 2.332451491176755e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9459 }, { "completion_length": 595.0, "epoch": 2.6219512195121952, "grad_norm": 0.45685508847236633, "kl": 7467552800243712.0, "learning_rate": 2.3320146864286296e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9460 }, { "completion_length": 749.5, "epoch": 2.6222283813747227, "grad_norm": 0.0, "kl": 0.20048761367797852, "learning_rate": 2.331577886831917e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9461 }, { "completion_length": 809.75, "epoch": 2.6225055432372506, "grad_norm": 0.0, "kl": 0.1814620941877365, "learning_rate": 2.3311410924000083e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9462 }, { "completion_length": 690.25, "epoch": 2.6227827050997785, "grad_norm": 0.0, "kl": 0.9818145036697388, "learning_rate": 2.3307043031463012e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9463 }, { "completion_length": 585.5, "epoch": 2.623059866962306, "grad_norm": 0.0, "kl": 0.19490018486976624, "learning_rate": 2.3302675190841874e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9464 }, { "completion_length": 791.75, "epoch": 2.6233370288248334, "grad_norm": 11.958169937133789, "kl": 1764129280.0, "learning_rate": 2.329830740227063e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9465 }, { "completion_length": 592.5, "epoch": 2.6236141906873613, "grad_norm": 0.0, "kl": 0.20245905220508575, "learning_rate": 2.3293939665883233e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9466 }, { "completion_length": 692.25, "epoch": 2.6238913525498893, "grad_norm": 0.0, "kl": 0.29441699385643005, "learning_rate": 2.3289571981813595e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9467 }, { "completion_length": 625.75, "epoch": 2.6241685144124167, "grad_norm": 4.5293145179748535, "kl": 401957.625, "learning_rate": 2.3285204350195677e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9468 }, { "completion_length": 764.0, "epoch": 2.6244456762749446, "grad_norm": 0.0, "kl": 0.177511528134346, "learning_rate": 2.32808367711634e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9469 }, { "completion_length": 603.25, "epoch": 2.6247228381374725, "grad_norm": 0.40659573674201965, "kl": 319162912.0, "learning_rate": 2.327646924485072e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9470 }, { "completion_length": 622.75, "epoch": 2.625, "grad_norm": 0.45184391736984253, "kl": 1259239045595136.0, "learning_rate": 2.3272101771391543e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9471 }, { "completion_length": 747.25, "epoch": 2.6252771618625275, "grad_norm": 1.4273428916931152, "kl": 77483.15625, "learning_rate": 2.3267734350919825e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9472 }, { "completion_length": 713.75, "epoch": 2.6255543237250554, "grad_norm": 0.0, "kl": 0.32625874876976013, "learning_rate": 2.326336698356949e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9473 }, { "completion_length": 675.0, "epoch": 2.6258314855875833, "grad_norm": 0.0, "kl": 0.15936826169490814, "learning_rate": 2.3258999669474463e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9474 }, { "completion_length": 882.75, "epoch": 2.6261086474501107, "grad_norm": 0.2977827489376068, "kl": 27108394.0, "learning_rate": 2.325463240876868e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9475 }, { "completion_length": 669.75, "epoch": 2.6263858093126387, "grad_norm": 0.6375053524971008, "kl": 57429524.0, "learning_rate": 2.3250265201586055e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9476 }, { "completion_length": 733.75, "epoch": 2.6266629711751666, "grad_norm": 0.0, "kl": 0.16923180222511292, "learning_rate": 2.324589804806052e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9477 }, { "completion_length": 679.75, "epoch": 2.626940133037694, "grad_norm": 0.0, "kl": 0.17447267472743988, "learning_rate": 2.324153094832599e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9478 }, { "completion_length": 557.0, "epoch": 2.6272172949002215, "grad_norm": 0.0, "kl": 0.5623590350151062, "learning_rate": 2.323716390251639e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9479 }, { "completion_length": 679.25, "epoch": 2.6274944567627494, "grad_norm": 0.0, "kl": 0.17575770616531372, "learning_rate": 2.323279691076565e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9480 }, { "completion_length": 597.0, "epoch": 2.6277716186252773, "grad_norm": 0.0, "kl": 0.28430551290512085, "learning_rate": 2.3228429973207663e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9481 }, { "completion_length": 675.5, "epoch": 2.6280487804878048, "grad_norm": 0.0, "kl": 0.16200214624404907, "learning_rate": 2.3224063089976375e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9482 }, { "completion_length": 623.5, "epoch": 2.6283259423503327, "grad_norm": 0.0, "kl": 0.2980997562408447, "learning_rate": 2.321969626120567e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9483 }, { "completion_length": 642.0, "epoch": 2.6286031042128606, "grad_norm": 0.0, "kl": 0.16998925805091858, "learning_rate": 2.3215329487029486e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9484 }, { "completion_length": 515.5, "epoch": 2.628880266075388, "grad_norm": 0.0, "kl": 0.20558896660804749, "learning_rate": 2.3210962767581717e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9485 }, { "completion_length": 671.5, "epoch": 2.6291574279379155, "grad_norm": 0.0, "kl": 0.23694346845149994, "learning_rate": 2.320659610299628e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9486 }, { "completion_length": 759.75, "epoch": 2.6294345898004434, "grad_norm": 0.0, "kl": 1.151537640374272e+16, "learning_rate": 2.320222949340708e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9487 }, { "completion_length": 713.75, "epoch": 2.6297117516629713, "grad_norm": 0.0, "kl": 0.17180347442626953, "learning_rate": 2.3197862938948017e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9488 }, { "completion_length": 549.75, "epoch": 2.629988913525499, "grad_norm": 0.9416792988777161, "kl": 26406168.0, "learning_rate": 2.319349643975301e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9489 }, { "completion_length": 734.75, "epoch": 2.6302660753880267, "grad_norm": 0.0, "kl": 0.2126927226781845, "learning_rate": 2.3189129995955944e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9490 }, { "completion_length": 634.25, "epoch": 2.630543237250554, "grad_norm": 0.0, "kl": 0.21839895844459534, "learning_rate": 2.3184763607690734e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9491 }, { "completion_length": 676.25, "epoch": 2.630820399113082, "grad_norm": 0.49774396419525146, "kl": 6326350461796352.0, "learning_rate": 2.3180397275091264e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9492 }, { "completion_length": 625.75, "epoch": 2.6310975609756095, "grad_norm": 0.0, "kl": 56591356.0, "learning_rate": 2.3176030998291457e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9493 }, { "completion_length": 564.25, "epoch": 2.6313747228381374, "grad_norm": 3.1003379821777344, "kl": 63542824.0, "learning_rate": 2.3171664777425174e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9494 }, { "completion_length": 678.0, "epoch": 2.6316518847006654, "grad_norm": 0.0, "kl": 0.23503541946411133, "learning_rate": 2.316729861262633e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9495 }, { "completion_length": 675.75, "epoch": 2.631929046563193, "grad_norm": 0.4970041811466217, "kl": 521083616.0, "learning_rate": 2.3162932504028828e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9496 }, { "completion_length": 640.5, "epoch": 2.6322062084257207, "grad_norm": 0.0, "kl": 0.1942899525165558, "learning_rate": 2.315856645176653e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9497 }, { "completion_length": 551.0, "epoch": 2.632483370288248, "grad_norm": 0.0, "kl": 0.23117531836032867, "learning_rate": 2.3154200455973345e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9498 }, { "completion_length": 583.25, "epoch": 2.632760532150776, "grad_norm": 0.0, "kl": 0.22559794783592224, "learning_rate": 2.314983451678315e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9499 }, { "completion_length": 688.75, "epoch": 2.6330376940133036, "grad_norm": 0.0, "kl": 20650718658560.0, "learning_rate": 2.314546863432984e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9500 }, { "completion_length": 636.5, "epoch": 2.6333148558758315, "grad_norm": 0.0, "kl": 0.26437637209892273, "learning_rate": 2.3141102808747284e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9501 }, { "completion_length": 865.25, "epoch": 2.6335920177383594, "grad_norm": 0.0, "kl": 0.27057287096977234, "learning_rate": 2.3136737040169367e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9502 }, { "completion_length": 706.25, "epoch": 2.633869179600887, "grad_norm": 0.0, "kl": 0.21406368911266327, "learning_rate": 2.313237132872999e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9503 }, { "completion_length": 701.5, "epoch": 2.6341463414634148, "grad_norm": 0.0, "kl": 0.23479735851287842, "learning_rate": 2.3128005674563006e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9504 }, { "completion_length": 672.5, "epoch": 2.634423503325942, "grad_norm": 0.0, "kl": 0.19514192640781403, "learning_rate": 2.3123640077802305e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9505 }, { "completion_length": 626.75, "epoch": 2.63470066518847, "grad_norm": 7.000513553619385, "kl": 884594376704.0, "learning_rate": 2.3119274538581755e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9506 }, { "completion_length": 620.25, "epoch": 2.6349778270509976, "grad_norm": 0.0, "kl": 0.2515813708305359, "learning_rate": 2.3114909057035233e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9507 }, { "completion_length": 623.5, "epoch": 2.6352549889135255, "grad_norm": 0.0, "kl": 0.34061020612716675, "learning_rate": 2.3110543633296605e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9508 }, { "completion_length": 684.25, "epoch": 2.6355321507760534, "grad_norm": 0.3735122084617615, "kl": 9704021191294976.0, "learning_rate": 2.3106178267499744e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9509 }, { "completion_length": 602.75, "epoch": 2.635809312638581, "grad_norm": 0.4483698010444641, "kl": 8856201089515520.0, "learning_rate": 2.310181295977852e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9510 }, { "completion_length": 709.5, "epoch": 2.636086474501109, "grad_norm": 0.0, "kl": 0.18260455131530762, "learning_rate": 2.309744771026679e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9511 }, { "completion_length": 673.0, "epoch": 2.6363636363636362, "grad_norm": 0.0, "kl": 0.21691997349262238, "learning_rate": 2.309308251909844e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9512 }, { "completion_length": 589.75, "epoch": 2.636640798226164, "grad_norm": 0.468821257352829, "kl": 3324320768.0, "learning_rate": 2.30887173864073e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9513 }, { "completion_length": 654.0, "epoch": 2.6369179600886916, "grad_norm": 0.0, "kl": 0.20224212110042572, "learning_rate": 2.308435231232726e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9514 }, { "completion_length": 588.0, "epoch": 2.6371951219512195, "grad_norm": 0.0, "kl": 0.1800781935453415, "learning_rate": 2.3079987296992152e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9515 }, { "completion_length": 634.5, "epoch": 2.6374722838137474, "grad_norm": 0.0, "kl": 0.19976018369197845, "learning_rate": 2.3075622340535856e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9516 }, { "completion_length": 805.25, "epoch": 2.637749445676275, "grad_norm": 0.0, "kl": 0.19103048741817474, "learning_rate": 2.3071257443092213e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9517 }, { "completion_length": 670.0, "epoch": 2.638026607538803, "grad_norm": 0.3796848952770233, "kl": 382464064.0, "learning_rate": 2.306689260479508e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9518 }, { "completion_length": 626.75, "epoch": 2.6383037694013303, "grad_norm": 0.0, "kl": 0.539726197719574, "learning_rate": 2.3062527825778318e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9519 }, { "completion_length": 735.75, "epoch": 2.638580931263858, "grad_norm": 0.0, "kl": 0.15489985048770905, "learning_rate": 2.305816310617576e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9520 }, { "completion_length": 648.0, "epoch": 2.6388580931263856, "grad_norm": 0.0, "kl": 0.37561872601509094, "learning_rate": 2.3053798446121267e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9521 }, { "completion_length": 692.75, "epoch": 2.6391352549889135, "grad_norm": 0.0, "kl": 0.20572641491889954, "learning_rate": 2.304943384574867e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9522 }, { "completion_length": 715.75, "epoch": 2.6394124168514415, "grad_norm": 0.0, "kl": 0.14483413100242615, "learning_rate": 2.3045069305191834e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9523 }, { "completion_length": 646.75, "epoch": 2.639689578713969, "grad_norm": 0.0, "kl": 0.19840183854103088, "learning_rate": 2.3040704824584582e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9524 }, { "completion_length": 764.5, "epoch": 2.639966740576497, "grad_norm": 0.0, "kl": 1.1585669985992704e+16, "learning_rate": 2.303634040406076e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9525 }, { "completion_length": 535.75, "epoch": 2.6402439024390243, "grad_norm": 0.410043865442276, "kl": 0.22828616201877594, "learning_rate": 2.303197604375422e-06, "loss": 0.0, "reward": 5.5625, "reward_std": 0.375, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.6875, "step": 9526 }, { "completion_length": 656.5, "epoch": 2.640521064301552, "grad_norm": 0.0, "kl": 0.2720610797405243, "learning_rate": 2.302761174379878e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9527 }, { "completion_length": 656.75, "epoch": 2.6407982261640797, "grad_norm": 0.0, "kl": 6968568868503552.0, "learning_rate": 2.3023247504328293e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9528 }, { "completion_length": 734.0, "epoch": 2.6410753880266076, "grad_norm": 0.0, "kl": 0.15693478286266327, "learning_rate": 2.301888332547657e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9529 }, { "completion_length": 684.0, "epoch": 2.6413525498891355, "grad_norm": 0.0, "kl": 0.23838993906974792, "learning_rate": 2.3014519207377466e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9530 }, { "completion_length": 580.5, "epoch": 2.641629711751663, "grad_norm": 0.0, "kl": 0.17160384356975555, "learning_rate": 2.3010155150164787e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9531 }, { "completion_length": 598.75, "epoch": 2.641906873614191, "grad_norm": 0.0, "kl": 0.24141106009483337, "learning_rate": 2.3005791153972366e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9532 }, { "completion_length": 668.25, "epoch": 2.6421840354767183, "grad_norm": 0.0, "kl": 0.2904071509838104, "learning_rate": 2.300142721893406e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9533 }, { "completion_length": 556.5, "epoch": 2.6424611973392462, "grad_norm": 0.4210166931152344, "kl": 0.18722175061702728, "learning_rate": 2.2997063345183643e-06, "loss": 0.0, "reward": 4.4375, "reward_std": 2.625, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.6875, "step": 9534 }, { "completion_length": 670.5, "epoch": 2.6427383592017737, "grad_norm": 0.0, "kl": 0.1697842925786972, "learning_rate": 2.2992699532854974e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9535 }, { "completion_length": 829.5, "epoch": 2.6430155210643016, "grad_norm": 0.0, "kl": 0.18779689073562622, "learning_rate": 2.2988335782081854e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9536 }, { "completion_length": 623.25, "epoch": 2.6432926829268295, "grad_norm": 0.0, "kl": 0.22092817723751068, "learning_rate": 2.2983972092998114e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9537 }, { "completion_length": 599.75, "epoch": 2.643569844789357, "grad_norm": 0.0, "kl": 201299017728.0, "learning_rate": 2.297960846573756e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9538 }, { "completion_length": 624.75, "epoch": 2.6438470066518844, "grad_norm": 0.0, "kl": 0.18559026718139648, "learning_rate": 2.2975244900434004e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9539 }, { "completion_length": 710.0, "epoch": 2.6441241685144123, "grad_norm": 0.0, "kl": 3.184863567352295, "learning_rate": 2.297088139722127e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9540 }, { "completion_length": 620.5, "epoch": 2.6444013303769403, "grad_norm": 0.0, "kl": 0.20494651794433594, "learning_rate": 2.296651795623316e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9541 }, { "completion_length": 723.75, "epoch": 2.6446784922394677, "grad_norm": 0.0, "kl": 400884.28125, "learning_rate": 2.2962154577603495e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9542 }, { "completion_length": 686.0, "epoch": 2.6449556541019956, "grad_norm": 0.0, "kl": 0.1843319833278656, "learning_rate": 2.2957791261466057e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9543 }, { "completion_length": 728.25, "epoch": 2.6452328159645235, "grad_norm": 0.0, "kl": 2.331592559814453, "learning_rate": 2.2953428007954682e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9544 }, { "completion_length": 687.75, "epoch": 2.645509977827051, "grad_norm": 0.0, "kl": 0.16918417811393738, "learning_rate": 2.294906481720314e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9545 }, { "completion_length": 615.5, "epoch": 2.6457871396895785, "grad_norm": 0.0, "kl": 0.20345818996429443, "learning_rate": 2.294470168934526e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9546 }, { "completion_length": 629.0, "epoch": 2.6460643015521064, "grad_norm": 1.0866566896438599, "kl": 184553728.0, "learning_rate": 2.2940338624514823e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9547 }, { "completion_length": 636.25, "epoch": 2.6463414634146343, "grad_norm": 0.0, "kl": 0.19783665239810944, "learning_rate": 2.293597562284563e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9548 }, { "completion_length": 656.75, "epoch": 2.6466186252771617, "grad_norm": 0.41641363501548767, "kl": 715003456.0, "learning_rate": 2.2931612684471487e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9549 }, { "completion_length": 566.5, "epoch": 2.6468957871396896, "grad_norm": 0.0, "kl": 0.2673836052417755, "learning_rate": 2.2927249809526174e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9550 }, { "completion_length": 572.75, "epoch": 2.6471729490022176, "grad_norm": 0.0, "kl": 0.195217564702034, "learning_rate": 2.292288699814349e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9551 }, { "completion_length": 699.0, "epoch": 2.647450110864745, "grad_norm": 0.0, "kl": 0.17699162662029266, "learning_rate": 2.2918524250457217e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9552 }, { "completion_length": 569.75, "epoch": 2.6477272727272725, "grad_norm": 0.0, "kl": 0.21437320113182068, "learning_rate": 2.291416156660115e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9553 }, { "completion_length": 545.5, "epoch": 2.6480044345898004, "grad_norm": 0.0, "kl": 0.24409367144107819, "learning_rate": 2.2909798946709063e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9554 }, { "completion_length": 642.5, "epoch": 2.6482815964523283, "grad_norm": 0.0, "kl": 1.8802490234375, "learning_rate": 2.2905436390914753e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9555 }, { "completion_length": 671.75, "epoch": 2.6485587583148558, "grad_norm": 0.0, "kl": 0.2403729110956192, "learning_rate": 2.2901073899351997e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9556 }, { "completion_length": 634.5, "epoch": 2.6488359201773837, "grad_norm": 0.0, "kl": 1.0721925497055054, "learning_rate": 2.289671147215457e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9557 }, { "completion_length": 567.5, "epoch": 2.6491130820399116, "grad_norm": 0.0, "kl": 0.18322157859802246, "learning_rate": 2.2892349109456257e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9558 }, { "completion_length": 596.75, "epoch": 2.649390243902439, "grad_norm": 0.0, "kl": 34609880367104.0, "learning_rate": 2.2887986811390826e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9559 }, { "completion_length": 633.5, "epoch": 2.6496674057649665, "grad_norm": 0.0, "kl": 0.23160353302955627, "learning_rate": 2.2883624578092058e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9560 }, { "completion_length": 621.5, "epoch": 2.6499445676274944, "grad_norm": 0.0, "kl": 0.20203512907028198, "learning_rate": 2.2879262409693717e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9561 }, { "completion_length": 636.25, "epoch": 2.6502217294900223, "grad_norm": 2.9794857501983643, "kl": 2.3834689080918016e+16, "learning_rate": 2.2874900306329565e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9562 }, { "completion_length": 643.5, "epoch": 2.65049889135255, "grad_norm": 0.0, "kl": 0.255147248506546, "learning_rate": 2.2870538268133406e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9563 }, { "completion_length": 656.5, "epoch": 2.6507760532150777, "grad_norm": 0.0, "kl": 0.1943817287683487, "learning_rate": 2.2866176295238955e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9564 }, { "completion_length": 610.5, "epoch": 2.651053215077605, "grad_norm": 0.0, "kl": 0.18905557692050934, "learning_rate": 2.286181438778002e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9565 }, { "completion_length": 627.0, "epoch": 2.651330376940133, "grad_norm": 0.0, "kl": 0.15580929815769196, "learning_rate": 2.2857452545890327e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9566 }, { "completion_length": 561.25, "epoch": 2.6516075388026605, "grad_norm": 0.0, "kl": 0.2199864387512207, "learning_rate": 2.2853090769703663e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9567 }, { "completion_length": 782.25, "epoch": 2.6518847006651884, "grad_norm": 0.0, "kl": 0.15919926762580872, "learning_rate": 2.2848729059353766e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9568 }, { "completion_length": 618.75, "epoch": 2.6521618625277164, "grad_norm": 0.0, "kl": 4.424117088317871, "learning_rate": 2.2844367414974403e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9569 }, { "completion_length": 764.0, "epoch": 2.652439024390244, "grad_norm": 0.0, "kl": 0.31416580080986023, "learning_rate": 2.284000583669933e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9570 }, { "completion_length": 628.75, "epoch": 2.6527161862527717, "grad_norm": 0.0, "kl": 0.20936883985996246, "learning_rate": 2.2835644324662283e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9571 }, { "completion_length": 653.25, "epoch": 2.652993348115299, "grad_norm": 0.0, "kl": 0.22759991884231567, "learning_rate": 2.283128287899703e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9572 }, { "completion_length": 693.75, "epoch": 2.653270509977827, "grad_norm": 0.0, "kl": 0.2465737760066986, "learning_rate": 2.2826921499837296e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9573 }, { "completion_length": 664.5, "epoch": 2.6535476718403546, "grad_norm": 0.0, "kl": 0.20877031981945038, "learning_rate": 2.2822560187316858e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9574 }, { "completion_length": 693.25, "epoch": 2.6538248337028825, "grad_norm": 0.7763335704803467, "kl": 1506857868853248.0, "learning_rate": 2.2818198941569424e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9575 }, { "completion_length": 673.0, "epoch": 2.6541019955654104, "grad_norm": 0.0, "kl": 0.23211126029491425, "learning_rate": 2.281383776272876e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9576 }, { "completion_length": 618.75, "epoch": 2.654379157427938, "grad_norm": 0.0, "kl": 0.2000756710767746, "learning_rate": 2.28094766509286e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9577 }, { "completion_length": 622.0, "epoch": 2.6546563192904657, "grad_norm": 0.0, "kl": 0.1702718436717987, "learning_rate": 2.2805115606302675e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9578 }, { "completion_length": 600.0, "epoch": 2.654933481152993, "grad_norm": 0.0, "kl": 0.19721224904060364, "learning_rate": 2.280075462898473e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9579 }, { "completion_length": 781.0, "epoch": 2.655210643015521, "grad_norm": 0.0, "kl": 0.17123815417289734, "learning_rate": 2.2796393719108483e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9580 }, { "completion_length": 655.25, "epoch": 2.6554878048780486, "grad_norm": 0.0, "kl": 471064346624.0, "learning_rate": 2.2792032876807683e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9581 }, { "completion_length": 694.25, "epoch": 2.6557649667405765, "grad_norm": 0.0, "kl": 3287411454902272.0, "learning_rate": 2.2787672102216045e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9582 }, { "completion_length": 636.75, "epoch": 2.6560421286031044, "grad_norm": 0.0, "kl": 0.3164544701576233, "learning_rate": 2.2783311395467304e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9583 }, { "completion_length": 641.0, "epoch": 2.656319290465632, "grad_norm": 0.0, "kl": 0.2237819880247116, "learning_rate": 2.277895075669517e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9584 }, { "completion_length": 600.5, "epoch": 2.6565964523281598, "grad_norm": 0.0, "kl": 18630608.0, "learning_rate": 2.277459018603338e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9585 }, { "completion_length": 559.5, "epoch": 2.6568736141906872, "grad_norm": 0.0, "kl": 0.18644824624061584, "learning_rate": 2.277022968361566e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9586 }, { "completion_length": 700.0, "epoch": 2.657150776053215, "grad_norm": 0.0, "kl": 0.19413204491138458, "learning_rate": 2.2765869249575715e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9587 }, { "completion_length": 645.5, "epoch": 2.6574279379157426, "grad_norm": 0.0, "kl": 0.17907045781612396, "learning_rate": 2.2761508884047272e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9588 }, { "completion_length": 615.75, "epoch": 2.6577050997782705, "grad_norm": 0.0, "kl": 0.17427489161491394, "learning_rate": 2.275714858716403e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9589 }, { "completion_length": 800.5, "epoch": 2.6579822616407984, "grad_norm": 0.0, "kl": 0.1386583000421524, "learning_rate": 2.2752788359059725e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9590 }, { "completion_length": 581.5, "epoch": 2.658259423503326, "grad_norm": 0.0, "kl": 0.2070009410381317, "learning_rate": 2.2748428199868038e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9591 }, { "completion_length": 561.5, "epoch": 2.658536585365854, "grad_norm": 0.0, "kl": 0.22586970031261444, "learning_rate": 2.274406810972269e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9592 }, { "completion_length": 630.0, "epoch": 2.6588137472283813, "grad_norm": 0.0, "kl": 0.2575061619281769, "learning_rate": 2.2739708088757407e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9593 }, { "completion_length": 630.5, "epoch": 2.659090909090909, "grad_norm": 0.0, "kl": 6.161025524139404, "learning_rate": 2.2735348137105856e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9594 }, { "completion_length": 648.75, "epoch": 2.6593680709534366, "grad_norm": 0.0, "kl": 0.20557467639446259, "learning_rate": 2.2730988254901777e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9595 }, { "completion_length": 562.25, "epoch": 2.6596452328159645, "grad_norm": 0.0, "kl": 0.2805255055427551, "learning_rate": 2.272662844227883e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9596 }, { "completion_length": 667.25, "epoch": 2.6599223946784925, "grad_norm": 0.0, "kl": 0.18913628160953522, "learning_rate": 2.272226869937074e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9597 }, { "completion_length": 790.0, "epoch": 2.66019955654102, "grad_norm": 0.0, "kl": 0.18061788380146027, "learning_rate": 2.271790902631119e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9598 }, { "completion_length": 616.25, "epoch": 2.660476718403548, "grad_norm": 0.0, "kl": 633966566572032.0, "learning_rate": 2.2713549423233877e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9599 }, { "completion_length": 671.0, "epoch": 2.6607538802660753, "grad_norm": 0.0, "kl": 0.20283029973506927, "learning_rate": 2.2709189890272498e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9600 }, { "completion_length": 704.0, "epoch": 2.661031042128603, "grad_norm": 0.0, "kl": 0.17332227528095245, "learning_rate": 2.2704830427560733e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9601 }, { "completion_length": 717.5, "epoch": 2.6613082039911307, "grad_norm": 0.0, "kl": 0.20839452743530273, "learning_rate": 2.270047103523227e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9602 }, { "completion_length": 667.0, "epoch": 2.6615853658536586, "grad_norm": 0.0, "kl": 0.18127043545246124, "learning_rate": 2.269611171342079e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9603 }, { "completion_length": 578.5, "epoch": 2.6618625277161865, "grad_norm": 0.0, "kl": 0.17504917085170746, "learning_rate": 2.269175246225999e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9604 }, { "completion_length": 708.0, "epoch": 2.662139689578714, "grad_norm": 0.36271795630455017, "kl": 1.6852047215198208e+16, "learning_rate": 2.2687393281883525e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9605 }, { "completion_length": 664.5, "epoch": 2.662416851441242, "grad_norm": 0.0, "kl": 0.28457900881767273, "learning_rate": 2.2683034172425097e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9606 }, { "completion_length": 620.25, "epoch": 2.6626940133037693, "grad_norm": 0.0, "kl": 0.21241915225982666, "learning_rate": 2.2678675134018376e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9607 }, { "completion_length": 804.5, "epoch": 2.662971175166297, "grad_norm": 0.0, "kl": 7914937.5, "learning_rate": 2.267431616679703e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9608 }, { "completion_length": 666.25, "epoch": 2.6632483370288247, "grad_norm": 0.0, "kl": 0.23983775079250336, "learning_rate": 2.2669957270894733e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9609 }, { "completion_length": 606.0, "epoch": 2.6635254988913526, "grad_norm": 0.0, "kl": 2.3218209743499756, "learning_rate": 2.266559844644515e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9610 }, { "completion_length": 775.0, "epoch": 2.6638026607538805, "grad_norm": 0.0, "kl": 0.1546473205089569, "learning_rate": 2.2661239693581956e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9611 }, { "completion_length": 614.5, "epoch": 2.664079822616408, "grad_norm": 0.0, "kl": 0.19477209448814392, "learning_rate": 2.265688101243881e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9612 }, { "completion_length": 721.0, "epoch": 2.6643569844789354, "grad_norm": 0.0, "kl": 0.18096962571144104, "learning_rate": 2.2652522403149376e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9613 }, { "completion_length": 611.75, "epoch": 2.6646341463414633, "grad_norm": 0.0, "kl": 0.24105672538280487, "learning_rate": 2.2648163865847312e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9614 }, { "completion_length": 636.75, "epoch": 2.6649113082039912, "grad_norm": 0.0, "kl": 0.19361425936222076, "learning_rate": 2.2643805400666276e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9615 }, { "completion_length": 737.0, "epoch": 2.6651884700665187, "grad_norm": 0.0, "kl": 0.1520269364118576, "learning_rate": 2.2639447007739933e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9616 }, { "completion_length": 637.0, "epoch": 2.6654656319290466, "grad_norm": 0.0, "kl": 0.31935441493988037, "learning_rate": 2.2635088687201927e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9617 }, { "completion_length": 555.0, "epoch": 2.6657427937915745, "grad_norm": 0.0, "kl": 0.5039243698120117, "learning_rate": 2.263073043918592e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9618 }, { "completion_length": 769.25, "epoch": 2.666019955654102, "grad_norm": 0.0, "kl": 0.16932474076747894, "learning_rate": 2.262637226382555e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9619 }, { "completion_length": 597.0, "epoch": 2.6662971175166295, "grad_norm": 0.0, "kl": 0.20916271209716797, "learning_rate": 2.2622014161254475e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9620 }, { "completion_length": 610.25, "epoch": 2.6665742793791574, "grad_norm": 0.0, "kl": 0.24466218054294586, "learning_rate": 2.2617656131606324e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9621 }, { "completion_length": 630.25, "epoch": 2.6668514412416853, "grad_norm": 0.0, "kl": 0.4509563744068146, "learning_rate": 2.261329817501475e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9622 }, { "completion_length": 572.75, "epoch": 2.6671286031042127, "grad_norm": 0.0, "kl": 4546930671616.0, "learning_rate": 2.26089402916134e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9623 }, { "completion_length": 595.75, "epoch": 2.6674057649667406, "grad_norm": 0.0, "kl": 0.24036835134029388, "learning_rate": 2.260458248153589e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9624 }, { "completion_length": 738.0, "epoch": 2.6676829268292686, "grad_norm": 0.0, "kl": 540061703602176.0, "learning_rate": 2.260022474491589e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9625 }, { "completion_length": 578.0, "epoch": 2.667960088691796, "grad_norm": 0.0, "kl": 0.22036521136760712, "learning_rate": 2.2595867081886995e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9626 }, { "completion_length": 734.5, "epoch": 2.6682372505543235, "grad_norm": 2.9658265113830566, "kl": 98431168.0, "learning_rate": 2.2591509492582876e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9627 }, { "completion_length": 546.75, "epoch": 2.6685144124168514, "grad_norm": 0.0, "kl": 0.19271442294120789, "learning_rate": 2.258715197713712e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9628 }, { "completion_length": 613.25, "epoch": 2.6687915742793793, "grad_norm": 0.0, "kl": 0.21205054223537445, "learning_rate": 2.2582794535683387e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9629 }, { "completion_length": 650.5, "epoch": 2.6690687361419068, "grad_norm": 0.0, "kl": 0.190086230635643, "learning_rate": 2.25784371683553e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9630 }, { "completion_length": 546.5, "epoch": 2.6693458980044347, "grad_norm": 0.0, "kl": 0.19898183643817902, "learning_rate": 2.2574079875286456e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9631 }, { "completion_length": 627.75, "epoch": 2.6696230598669626, "grad_norm": 0.0, "kl": 0.19242139160633087, "learning_rate": 2.2569722656610497e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9632 }, { "completion_length": 687.75, "epoch": 2.66990022172949, "grad_norm": 0.0, "kl": 0.38946524262428284, "learning_rate": 2.2565365512461033e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9633 }, { "completion_length": 644.25, "epoch": 2.6701773835920175, "grad_norm": 0.0, "kl": 0.6604279279708862, "learning_rate": 2.2561008442971684e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9634 }, { "completion_length": 625.5, "epoch": 2.6704545454545454, "grad_norm": 0.0, "kl": 0.1981516182422638, "learning_rate": 2.255665144827606e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9635 }, { "completion_length": 668.5, "epoch": 2.6707317073170733, "grad_norm": 0.0, "kl": 0.1906128227710724, "learning_rate": 2.255229452850776e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9636 }, { "completion_length": 526.25, "epoch": 2.671008869179601, "grad_norm": 0.0, "kl": 0.2906358242034912, "learning_rate": 2.2547937683800415e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9637 }, { "completion_length": 711.5, "epoch": 2.6712860310421287, "grad_norm": 0.0, "kl": 0.1759042739868164, "learning_rate": 2.2543580914287622e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9638 }, { "completion_length": 633.0, "epoch": 2.671563192904656, "grad_norm": 13.595983505249023, "kl": 12541.9521484375, "learning_rate": 2.2539224220102986e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9639 }, { "completion_length": 726.25, "epoch": 2.671840354767184, "grad_norm": 0.0, "kl": 0.1639459878206253, "learning_rate": 2.25348676013801e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9640 }, { "completion_length": 619.75, "epoch": 2.6721175166297115, "grad_norm": 0.0, "kl": 0.33483755588531494, "learning_rate": 2.2530511058252575e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9641 }, { "completion_length": 658.5, "epoch": 2.6723946784922394, "grad_norm": 0.0, "kl": 0.18399012088775635, "learning_rate": 2.2526154590854e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9642 }, { "completion_length": 637.25, "epoch": 2.6726718403547673, "grad_norm": 0.0, "kl": 0.3018074631690979, "learning_rate": 2.2521798199317976e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9643 }, { "completion_length": 676.0, "epoch": 2.672949002217295, "grad_norm": 0.0, "kl": 0.1773204207420349, "learning_rate": 2.2517441883778086e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9644 }, { "completion_length": 704.25, "epoch": 2.6732261640798227, "grad_norm": 0.0, "kl": 1.9191035032272339, "learning_rate": 2.2513085644367925e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9645 }, { "completion_length": 599.5, "epoch": 2.67350332594235, "grad_norm": 0.0, "kl": 0.20441554486751556, "learning_rate": 2.2508729481221096e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9646 }, { "completion_length": 558.75, "epoch": 2.673780487804878, "grad_norm": 1.9976904392242432, "kl": 208025.921875, "learning_rate": 2.2504373394471153e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9647 }, { "completion_length": 593.5, "epoch": 2.6740576496674056, "grad_norm": 0.0, "kl": 0.22560764849185944, "learning_rate": 2.2500017384251705e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9648 }, { "completion_length": 737.75, "epoch": 2.6743348115299335, "grad_norm": 0.0, "kl": 0.16337478160858154, "learning_rate": 2.249566145069633e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9649 }, { "completion_length": 612.5, "epoch": 2.6746119733924614, "grad_norm": 0.39172062277793884, "kl": 0.1958315521478653, "learning_rate": 2.249130559393859e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9650 }, { "completion_length": 693.75, "epoch": 2.674889135254989, "grad_norm": 0.0, "kl": 0.18467393517494202, "learning_rate": 2.248694981411208e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9651 }, { "completion_length": 652.75, "epoch": 2.6751662971175167, "grad_norm": 0.0, "kl": 0.1922519952058792, "learning_rate": 2.2482594111350357e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9652 }, { "completion_length": 656.75, "epoch": 2.675443458980044, "grad_norm": 0.0, "kl": 0.22920583188533783, "learning_rate": 2.2478238485787005e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9653 }, { "completion_length": 693.0, "epoch": 2.675720620842572, "grad_norm": 0.0, "kl": 0.21841546893119812, "learning_rate": 2.247388293755559e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9654 }, { "completion_length": 556.5, "epoch": 2.6759977827050996, "grad_norm": 0.0, "kl": 0.2883727550506592, "learning_rate": 2.2469527466789674e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9655 }, { "completion_length": 671.5, "epoch": 2.6762749445676275, "grad_norm": 0.0, "kl": 0.2189818173646927, "learning_rate": 2.246517207362282e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9656 }, { "completion_length": 574.5, "epoch": 2.6765521064301554, "grad_norm": 0.0, "kl": 0.2073952704668045, "learning_rate": 2.2460816758188607e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9657 }, { "completion_length": 600.0, "epoch": 2.676829268292683, "grad_norm": 0.0, "kl": 0.19137582182884216, "learning_rate": 2.2456461520620565e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9658 }, { "completion_length": 671.5, "epoch": 2.6771064301552108, "grad_norm": 0.0, "kl": 0.43034568428993225, "learning_rate": 2.2452106361052275e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9659 }, { "completion_length": 672.75, "epoch": 2.6773835920177382, "grad_norm": 0.0, "kl": 0.23673446476459503, "learning_rate": 2.2447751279617286e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9660 }, { "completion_length": 707.25, "epoch": 2.677660753880266, "grad_norm": 0.0, "kl": 0.2065226435661316, "learning_rate": 2.2443396276449145e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9661 }, { "completion_length": 670.75, "epoch": 2.6779379157427936, "grad_norm": 0.0, "kl": 0.38868141174316406, "learning_rate": 2.2439041351681412e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9662 }, { "completion_length": 602.75, "epoch": 2.6782150776053215, "grad_norm": 0.0, "kl": 12458203.0, "learning_rate": 2.243468650544762e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9663 }, { "completion_length": 625.5, "epoch": 2.6784922394678494, "grad_norm": 0.0, "kl": 0.21481920778751373, "learning_rate": 2.2430331737881327e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9664 }, { "completion_length": 697.5, "epoch": 2.678769401330377, "grad_norm": 0.0, "kl": 0.274783730506897, "learning_rate": 2.242597704911606e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9665 }, { "completion_length": 734.75, "epoch": 2.679046563192905, "grad_norm": 0.0, "kl": 297.5539245605469, "learning_rate": 2.242162243928537e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9666 }, { "completion_length": 685.75, "epoch": 2.6793237250554323, "grad_norm": 0.0, "kl": 0.47486060857772827, "learning_rate": 2.2417267908522804e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9667 }, { "completion_length": 605.75, "epoch": 2.67960088691796, "grad_norm": 0.0, "kl": 11.335119247436523, "learning_rate": 2.241291345696188e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9668 }, { "completion_length": 573.25, "epoch": 2.6798780487804876, "grad_norm": 0.0, "kl": 0.1941651552915573, "learning_rate": 2.2408559084736144e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9669 }, { "completion_length": 605.0, "epoch": 2.6801552106430155, "grad_norm": 0.0, "kl": 0.1792079508304596, "learning_rate": 2.240420479197912e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9670 }, { "completion_length": 591.5, "epoch": 2.6804323725055434, "grad_norm": 0.0, "kl": 0.19151370227336884, "learning_rate": 2.2399850578824335e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9671 }, { "completion_length": 576.75, "epoch": 2.680709534368071, "grad_norm": 0.0, "kl": 1547391.375, "learning_rate": 2.2395496445405315e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9672 }, { "completion_length": 607.25, "epoch": 2.680986696230599, "grad_norm": 0.0, "kl": 0.24787762761116028, "learning_rate": 2.239114239185559e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9673 }, { "completion_length": 581.75, "epoch": 2.6812638580931263, "grad_norm": 0.0, "kl": 0.2731468975543976, "learning_rate": 2.238678841830867e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9674 }, { "completion_length": 638.75, "epoch": 2.681541019955654, "grad_norm": 0.0, "kl": 0.20346882939338684, "learning_rate": 2.238243452489807e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9675 }, { "completion_length": 722.5, "epoch": 2.6818181818181817, "grad_norm": 0.0, "kl": 0.3132617473602295, "learning_rate": 2.2378080711757332e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9676 }, { "completion_length": 678.75, "epoch": 2.6820953436807096, "grad_norm": 0.0, "kl": 0.19931411743164062, "learning_rate": 2.237372697901994e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9677 }, { "completion_length": 705.25, "epoch": 2.6823725055432375, "grad_norm": 0.0, "kl": 0.22134730219841003, "learning_rate": 2.236937332681943e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9678 }, { "completion_length": 763.0, "epoch": 2.682649667405765, "grad_norm": 3.4170615673065186, "kl": 102257049600.0, "learning_rate": 2.2365019755289274e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9679 }, { "completion_length": 638.25, "epoch": 2.682926829268293, "grad_norm": 0.0, "kl": 259565200.0, "learning_rate": 2.236066626456302e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9680 }, { "completion_length": 612.0, "epoch": 2.6832039911308203, "grad_norm": 0.0, "kl": 3906229504.0, "learning_rate": 2.235631285477414e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9681 }, { "completion_length": 716.25, "epoch": 2.683481152993348, "grad_norm": 0.0, "kl": 0.5346553921699524, "learning_rate": 2.2351959526056154e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9682 }, { "completion_length": 715.0, "epoch": 2.6837583148558757, "grad_norm": 0.0, "kl": 354823776.0, "learning_rate": 2.234760627854256e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9683 }, { "completion_length": 643.0, "epoch": 2.6840354767184036, "grad_norm": 0.0, "kl": 0.1949184238910675, "learning_rate": 2.2343253112366834e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9684 }, { "completion_length": 602.25, "epoch": 2.6843126385809315, "grad_norm": 0.0, "kl": 0.19515454769134521, "learning_rate": 2.2338900027662496e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9685 }, { "completion_length": 776.0, "epoch": 2.684589800443459, "grad_norm": 0.0, "kl": 9.529095649719238, "learning_rate": 2.233454702456301e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9686 }, { "completion_length": 604.75, "epoch": 2.6848669623059864, "grad_norm": 0.0, "kl": 0.24242736399173737, "learning_rate": 2.23301941032019e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9687 }, { "completion_length": 766.5, "epoch": 2.6851441241685143, "grad_norm": 0.0, "kl": 0.18490637838840485, "learning_rate": 2.2325841263712607e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9688 }, { "completion_length": 619.5, "epoch": 2.6854212860310422, "grad_norm": 0.0, "kl": 36482656.0, "learning_rate": 2.2321488506228646e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9689 }, { "completion_length": 654.75, "epoch": 2.6856984478935697, "grad_norm": 0.0, "kl": 0.2376275658607483, "learning_rate": 2.2317135830883498e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9690 }, { "completion_length": 657.25, "epoch": 2.6859756097560976, "grad_norm": 0.4236834645271301, "kl": 0.18919619917869568, "learning_rate": 2.2312783237810626e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9691 }, { "completion_length": 696.0, "epoch": 2.6862527716186255, "grad_norm": 0.0, "kl": 1.2965511083602905, "learning_rate": 2.2308430727143523e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9692 }, { "completion_length": 609.5, "epoch": 2.686529933481153, "grad_norm": 0.0, "kl": 0.19052955508232117, "learning_rate": 2.2304078299015643e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9693 }, { "completion_length": 636.5, "epoch": 2.6868070953436805, "grad_norm": 0.0, "kl": 0.2864357829093933, "learning_rate": 2.2299725953560474e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9694 }, { "completion_length": 732.75, "epoch": 2.6870842572062084, "grad_norm": 0.0, "kl": 0.38390257954597473, "learning_rate": 2.229537369091147e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9695 }, { "completion_length": 572.25, "epoch": 2.6873614190687363, "grad_norm": 0.0, "kl": 1.111053705215454, "learning_rate": 2.22910215112021e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9696 }, { "completion_length": 590.25, "epoch": 2.6876385809312637, "grad_norm": 0.0, "kl": 0.17366115748882294, "learning_rate": 2.228666941456585e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9697 }, { "completion_length": 694.5, "epoch": 2.6879157427937916, "grad_norm": 0.0, "kl": 0.18246044218540192, "learning_rate": 2.2282317401136143e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9698 }, { "completion_length": 648.25, "epoch": 2.6881929046563195, "grad_norm": 0.0, "kl": 0.18489427864551544, "learning_rate": 2.2277965471046466e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9699 }, { "completion_length": 617.0, "epoch": 2.688470066518847, "grad_norm": 0.0, "kl": 0.23867788910865784, "learning_rate": 2.2273613624430256e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9700 }, { "completion_length": 642.5, "epoch": 2.6887472283813745, "grad_norm": 0.0, "kl": 274077.09375, "learning_rate": 2.226926186142099e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9701 }, { "completion_length": 590.0, "epoch": 2.6890243902439024, "grad_norm": 0.0, "kl": 0.373838871717453, "learning_rate": 2.226491018215209e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9702 }, { "completion_length": 704.75, "epoch": 2.6893015521064303, "grad_norm": 0.3619234561920166, "kl": 162023984.0, "learning_rate": 2.226055858675702e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9703 }, { "completion_length": 670.25, "epoch": 2.6895787139689578, "grad_norm": 0.4114542305469513, "kl": 2202366208.0, "learning_rate": 2.2256207075369225e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9704 }, { "completion_length": 580.25, "epoch": 2.6898558758314857, "grad_norm": 0.0, "kl": 0.21516357362270355, "learning_rate": 2.225185564812214e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9705 }, { "completion_length": 619.25, "epoch": 2.6901330376940136, "grad_norm": 0.0, "kl": 0.19392871856689453, "learning_rate": 2.2247504305149217e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9706 }, { "completion_length": 687.75, "epoch": 2.690410199556541, "grad_norm": 0.0, "kl": 0.20091445744037628, "learning_rate": 2.2243153046583878e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9707 }, { "completion_length": 638.25, "epoch": 2.6906873614190685, "grad_norm": 0.0, "kl": 0.16229252517223358, "learning_rate": 2.223880187255958e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9708 }, { "completion_length": 565.75, "epoch": 2.6909645232815964, "grad_norm": 0.0, "kl": 0.1935535967350006, "learning_rate": 2.2234450783209726e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9709 }, { "completion_length": 669.25, "epoch": 2.6912416851441243, "grad_norm": 0.0, "kl": 5.7644195556640625, "learning_rate": 2.2230099778667775e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9710 }, { "completion_length": 677.75, "epoch": 2.691518847006652, "grad_norm": 0.0, "kl": 2065447.125, "learning_rate": 2.2225748859067136e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9711 }, { "completion_length": 593.75, "epoch": 2.6917960088691797, "grad_norm": 0.0, "kl": 0.39923563599586487, "learning_rate": 2.2221398024541236e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9712 }, { "completion_length": 726.5, "epoch": 2.692073170731707, "grad_norm": 0.0, "kl": 6544276028653568.0, "learning_rate": 2.221704727522351e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9713 }, { "completion_length": 704.0, "epoch": 2.692350332594235, "grad_norm": 0.0, "kl": 0.19216211140155792, "learning_rate": 2.2212696611247355e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9714 }, { "completion_length": 619.75, "epoch": 2.6926274944567625, "grad_norm": 0.0, "kl": 0.9297257661819458, "learning_rate": 2.220834603274621e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9715 }, { "completion_length": 692.25, "epoch": 2.6929046563192904, "grad_norm": 0.0, "kl": 131.53201293945312, "learning_rate": 2.2203995539853474e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9716 }, { "completion_length": 643.75, "epoch": 2.6931818181818183, "grad_norm": 0.0, "kl": 6.62764835357666, "learning_rate": 2.2199645132702567e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9717 }, { "completion_length": 657.75, "epoch": 2.693458980044346, "grad_norm": 0.535253643989563, "kl": 0.21412956714630127, "learning_rate": 2.219529481142689e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9718 }, { "completion_length": 514.25, "epoch": 2.6937361419068737, "grad_norm": 0.0, "kl": 0.2384302020072937, "learning_rate": 2.219094457615985e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9719 }, { "completion_length": 609.5, "epoch": 2.694013303769401, "grad_norm": 0.0, "kl": 0.19180385768413544, "learning_rate": 2.2186594427034868e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9720 }, { "completion_length": 596.75, "epoch": 2.694290465631929, "grad_norm": 0.0, "kl": 0.2091415822505951, "learning_rate": 2.2182244364185316e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9721 }, { "completion_length": 720.25, "epoch": 2.6945676274944566, "grad_norm": 2.8457388877868652, "kl": 13107239936.0, "learning_rate": 2.2177894387744623e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9722 }, { "completion_length": 512.5, "epoch": 2.6948447893569845, "grad_norm": 0.0, "kl": 1328922496.0, "learning_rate": 2.217354449784616e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9723 }, { "completion_length": 468.75, "epoch": 2.6951219512195124, "grad_norm": 0.0, "kl": 0.2684137225151062, "learning_rate": 2.2169194694623336e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9724 }, { "completion_length": 794.25, "epoch": 2.69539911308204, "grad_norm": 1.4882004261016846, "kl": 7340231.5, "learning_rate": 2.216484497820953e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9725 }, { "completion_length": 599.75, "epoch": 2.6956762749445677, "grad_norm": 0.0, "kl": 0.19176490604877472, "learning_rate": 2.2160495348738127e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9726 }, { "completion_length": 657.25, "epoch": 2.695953436807095, "grad_norm": 0.0, "kl": 0.17179682850837708, "learning_rate": 2.2156145806342536e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9727 }, { "completion_length": 595.25, "epoch": 2.696230598669623, "grad_norm": 0.0, "kl": 0.23978842794895172, "learning_rate": 2.2151796351156106e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9728 }, { "completion_length": 621.75, "epoch": 2.6965077605321506, "grad_norm": 0.4241170585155487, "kl": 1497753728.0, "learning_rate": 2.2147446983312253e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9729 }, { "completion_length": 709.75, "epoch": 2.6967849223946785, "grad_norm": 0.0, "kl": 0.1979954093694687, "learning_rate": 2.214309770294432e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9730 }, { "completion_length": 637.75, "epoch": 2.6970620842572064, "grad_norm": 0.0, "kl": 0.20307135581970215, "learning_rate": 2.21387485101857e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9731 }, { "completion_length": 535.75, "epoch": 2.697339246119734, "grad_norm": 0.0, "kl": 0.24268437922000885, "learning_rate": 2.213439940516976e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9732 }, { "completion_length": 571.5, "epoch": 2.6976164079822618, "grad_norm": 0.0, "kl": 0.5563602447509766, "learning_rate": 2.213005038802987e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9733 }, { "completion_length": 654.0, "epoch": 2.6978935698447892, "grad_norm": 0.0, "kl": 0.17892484366893768, "learning_rate": 2.21257014588994e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9734 }, { "completion_length": 663.25, "epoch": 2.698170731707317, "grad_norm": 0.0, "kl": 0.24252256751060486, "learning_rate": 2.2121352617911706e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9735 }, { "completion_length": 702.25, "epoch": 2.6984478935698446, "grad_norm": 0.348065584897995, "kl": 7277782891495424.0, "learning_rate": 2.2117003865200154e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9736 }, { "completion_length": 639.0, "epoch": 2.6987250554323725, "grad_norm": 0.0, "kl": 0.21320360898971558, "learning_rate": 2.2112655200898094e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9737 }, { "completion_length": 686.5, "epoch": 2.6990022172949004, "grad_norm": 0.0, "kl": 0.26417481899261475, "learning_rate": 2.21083066251389e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9738 }, { "completion_length": 733.5, "epoch": 2.699279379157428, "grad_norm": 0.0, "kl": 0.3216487169265747, "learning_rate": 2.2103958138055897e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9739 }, { "completion_length": 660.0, "epoch": 2.699556541019956, "grad_norm": 0.4063315987586975, "kl": 387482252214272.0, "learning_rate": 2.2099609739782466e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9740 }, { "completion_length": 738.5, "epoch": 2.6998337028824833, "grad_norm": 0.0, "kl": 8931453.0, "learning_rate": 2.2095261430451924e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9741 }, { "completion_length": 620.75, "epoch": 2.700110864745011, "grad_norm": 0.0, "kl": 0.22977536916732788, "learning_rate": 2.209091321019764e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9742 }, { "completion_length": 688.0, "epoch": 2.7003880266075386, "grad_norm": 0.0, "kl": 0.18244190514087677, "learning_rate": 2.2086565079152945e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9743 }, { "completion_length": 734.75, "epoch": 2.7006651884700665, "grad_norm": 0.0, "kl": 0.1889231652021408, "learning_rate": 2.2082217037451183e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9744 }, { "completion_length": 702.75, "epoch": 2.7009423503325944, "grad_norm": 0.0, "kl": 0.2032703161239624, "learning_rate": 2.2077869085225683e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9745 }, { "completion_length": 590.0, "epoch": 2.701219512195122, "grad_norm": 0.0, "kl": 0.18920321762561798, "learning_rate": 2.207352122260978e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9746 }, { "completion_length": 573.75, "epoch": 2.70149667405765, "grad_norm": 0.0, "kl": 0.19229283928871155, "learning_rate": 2.2069173449736815e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9747 }, { "completion_length": 572.25, "epoch": 2.7017738359201773, "grad_norm": 0.4641747772693634, "kl": 0.20260167121887207, "learning_rate": 2.20648257667401e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9748 }, { "completion_length": 635.75, "epoch": 2.702050997782705, "grad_norm": 0.0, "kl": 20.134620666503906, "learning_rate": 2.2060478173752964e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9749 }, { "completion_length": 552.75, "epoch": 2.7023281596452327, "grad_norm": 0.0, "kl": 0.20694124698638916, "learning_rate": 2.2056130670908745e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9750 }, { "completion_length": 589.25, "epoch": 2.7026053215077606, "grad_norm": 0.0, "kl": 0.20085427165031433, "learning_rate": 2.205178325834075e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9751 }, { "completion_length": 626.5, "epoch": 2.7028824833702885, "grad_norm": 0.0, "kl": 0.22643828392028809, "learning_rate": 2.20474359361823e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9752 }, { "completion_length": 575.25, "epoch": 2.703159645232816, "grad_norm": 0.0, "kl": 0.2454988807439804, "learning_rate": 2.20430887045667e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9753 }, { "completion_length": 628.0, "epoch": 2.703436807095344, "grad_norm": 0.0, "kl": 0.1873091608285904, "learning_rate": 2.203874156362728e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9754 }, { "completion_length": 586.5, "epoch": 2.7037139689578713, "grad_norm": 0.0, "kl": 22192199680.0, "learning_rate": 2.203439451349733e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9755 }, { "completion_length": 646.75, "epoch": 2.703991130820399, "grad_norm": 0.4261699616909027, "kl": 0.225080668926239, "learning_rate": 2.2030047554310156e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9756 }, { "completion_length": 540.75, "epoch": 2.7042682926829267, "grad_norm": 0.0, "kl": 0.18346385657787323, "learning_rate": 2.202570068619909e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9757 }, { "completion_length": 616.25, "epoch": 2.7045454545454546, "grad_norm": 0.0, "kl": 0.2591593265533447, "learning_rate": 2.2021353909297393e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9758 }, { "completion_length": 641.75, "epoch": 2.7048226164079825, "grad_norm": 0.0, "kl": 1803435292753920.0, "learning_rate": 2.2017007223738396e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9759 }, { "completion_length": 559.75, "epoch": 2.70509977827051, "grad_norm": 0.0, "kl": 0.25283122062683105, "learning_rate": 2.201266062965536e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9760 }, { "completion_length": 576.75, "epoch": 2.7053769401330374, "grad_norm": 0.0, "kl": 2.487175226211548, "learning_rate": 2.200831412718161e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9761 }, { "completion_length": 556.0, "epoch": 2.7056541019955653, "grad_norm": 0.0, "kl": 0.4071854054927826, "learning_rate": 2.2003967716450414e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9762 }, { "completion_length": 671.0, "epoch": 2.7059312638580932, "grad_norm": 0.0, "kl": 0.215755894780159, "learning_rate": 2.1999621397595066e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9763 }, { "completion_length": 701.0, "epoch": 2.7062084257206207, "grad_norm": 0.5221542716026306, "kl": 1955707648.0, "learning_rate": 2.199527517074885e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9764 }, { "completion_length": 613.75, "epoch": 2.7064855875831486, "grad_norm": 0.0, "kl": 0.1943323314189911, "learning_rate": 2.199092903604504e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9765 }, { "completion_length": 605.5, "epoch": 2.7067627494456765, "grad_norm": 0.0, "kl": 8214930432.0, "learning_rate": 2.1986582993616926e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9766 }, { "completion_length": 629.25, "epoch": 2.707039911308204, "grad_norm": 0.3886861801147461, "kl": 0.209668830037117, "learning_rate": 2.1982237043597777e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9767 }, { "completion_length": 697.0, "epoch": 2.7073170731707314, "grad_norm": 0.0, "kl": 2491683438592.0, "learning_rate": 2.197789118612086e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9768 }, { "completion_length": 624.25, "epoch": 2.7075942350332594, "grad_norm": 0.5610248446464539, "kl": 0.1697615683078766, "learning_rate": 2.1973545421319444e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9769 }, { "completion_length": 560.5, "epoch": 2.7078713968957873, "grad_norm": 0.0, "kl": 0.2866775095462799, "learning_rate": 2.1969199749326814e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9770 }, { "completion_length": 728.5, "epoch": 2.7081485587583147, "grad_norm": 0.0, "kl": 0.30554336309432983, "learning_rate": 2.1964854170276205e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9771 }, { "completion_length": 675.25, "epoch": 2.7084257206208426, "grad_norm": 0.0, "kl": 0.19871409237384796, "learning_rate": 2.1960508684300896e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9772 }, { "completion_length": 660.75, "epoch": 2.7087028824833705, "grad_norm": 0.0, "kl": 0.18076537549495697, "learning_rate": 2.195616329153415e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9773 }, { "completion_length": 609.25, "epoch": 2.708980044345898, "grad_norm": 0.0, "kl": 0.29207706451416016, "learning_rate": 2.1951817992109204e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9774 }, { "completion_length": 666.5, "epoch": 2.7092572062084255, "grad_norm": 0.0, "kl": 0.1952819526195526, "learning_rate": 2.194747278615933e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9775 }, { "completion_length": 633.75, "epoch": 2.7095343680709534, "grad_norm": 0.0, "kl": 0.19374792277812958, "learning_rate": 2.194312767381776e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9776 }, { "completion_length": 628.5, "epoch": 2.7098115299334813, "grad_norm": 0.0, "kl": 0.18488071858882904, "learning_rate": 2.1938782655217754e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9777 }, { "completion_length": 662.25, "epoch": 2.7100886917960088, "grad_norm": 0.0, "kl": 0.1911991834640503, "learning_rate": 2.1934437730492544e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9778 }, { "completion_length": 749.5, "epoch": 2.7103658536585367, "grad_norm": 1.83527410030365, "kl": 58364360.0, "learning_rate": 2.193009289977537e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9779 }, { "completion_length": 617.25, "epoch": 2.7106430155210646, "grad_norm": 0.0, "kl": 1.2113689184188843, "learning_rate": 2.1925748163199494e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9780 }, { "completion_length": 593.25, "epoch": 2.710920177383592, "grad_norm": 9.16695785522461, "kl": 1818080256.0, "learning_rate": 2.1921403520898117e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9781 }, { "completion_length": 625.75, "epoch": 2.7111973392461195, "grad_norm": 0.0, "kl": 8362.26953125, "learning_rate": 2.1917058973004496e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9782 }, { "completion_length": 665.25, "epoch": 2.7114745011086474, "grad_norm": 0.0, "kl": 25406610604032.0, "learning_rate": 2.1912714519651847e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9783 }, { "completion_length": 595.25, "epoch": 2.7117516629711753, "grad_norm": 0.0, "kl": 0.24036014080047607, "learning_rate": 2.190837016097341e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9784 }, { "completion_length": 613.5, "epoch": 2.712028824833703, "grad_norm": 0.0, "kl": 0.2853570282459259, "learning_rate": 2.1904025897102386e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9785 }, { "completion_length": 556.0, "epoch": 2.7123059866962307, "grad_norm": 0.0, "kl": 5480972288.0, "learning_rate": 2.1899681728172017e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9786 }, { "completion_length": 603.0, "epoch": 2.712583148558758, "grad_norm": 4.308470249176025, "kl": 28.551664352416992, "learning_rate": 2.1895337654315514e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9787 }, { "completion_length": 586.75, "epoch": 2.712860310421286, "grad_norm": 0.0, "kl": 77.45870971679688, "learning_rate": 2.1890993675666077e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9788 }, { "completion_length": 575.25, "epoch": 2.7131374722838135, "grad_norm": 0.0, "kl": 0.18271999061107635, "learning_rate": 2.188664979235695e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9789 }, { "completion_length": 558.25, "epoch": 2.7134146341463414, "grad_norm": 0.0, "kl": 12493974528.0, "learning_rate": 2.188230600452131e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9790 }, { "completion_length": 628.0, "epoch": 2.7136917960088693, "grad_norm": 0.0, "kl": 2871270.5, "learning_rate": 2.1877962312292385e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9791 }, { "completion_length": 581.0, "epoch": 2.713968957871397, "grad_norm": 3.507192373275757, "kl": 40417.26171875, "learning_rate": 2.1873618715803356e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9792 }, { "completion_length": 626.75, "epoch": 2.7142461197339247, "grad_norm": 0.4189567565917969, "kl": 1.3749906153930752e+16, "learning_rate": 2.186927521518744e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9793 }, { "completion_length": 618.25, "epoch": 2.714523281596452, "grad_norm": 0.0, "kl": 0.19103679060935974, "learning_rate": 2.1864931810577836e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9794 }, { "completion_length": 583.0, "epoch": 2.71480044345898, "grad_norm": 0.0, "kl": 0.2181239128112793, "learning_rate": 2.1860588502107724e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9795 }, { "completion_length": 714.0, "epoch": 2.7150776053215075, "grad_norm": 0.342017263174057, "kl": 17489002496.0, "learning_rate": 2.185624528991031e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9796 }, { "completion_length": 743.75, "epoch": 2.7153547671840355, "grad_norm": 0.0, "kl": 0.18278200924396515, "learning_rate": 2.1851902174118772e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9797 }, { "completion_length": 595.0, "epoch": 2.7156319290465634, "grad_norm": 0.0, "kl": 0.20146572589874268, "learning_rate": 2.1847559154866303e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9798 }, { "completion_length": 660.5, "epoch": 2.715909090909091, "grad_norm": 3.0152089595794678, "kl": 563656130560.0, "learning_rate": 2.1843216232286076e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9799 }, { "completion_length": 593.5, "epoch": 2.7161862527716187, "grad_norm": 0.0, "kl": 0.2635369598865509, "learning_rate": 2.1838873406511267e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9800 }, { "completion_length": 602.0, "epoch": 2.716463414634146, "grad_norm": 0.0, "kl": 0.3381001949310303, "learning_rate": 2.183453067767507e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9801 }, { "completion_length": 630.5, "epoch": 2.716740576496674, "grad_norm": 0.0, "kl": 0.38332709670066833, "learning_rate": 2.183018804591065e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9802 }, { "completion_length": 929.0, "epoch": 2.7170177383592016, "grad_norm": 4.133391380310059, "kl": 4250230016.0, "learning_rate": 2.1825845511351175e-06, "loss": 0.0, "reward": 3.5625, "reward_std": 2.11517333984375, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5625, "step": 9803 }, { "completion_length": 606.75, "epoch": 2.7172949002217295, "grad_norm": 0.0, "kl": 0.20336811244487762, "learning_rate": 2.1821503074129814e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9804 }, { "completion_length": 685.0, "epoch": 2.7175720620842574, "grad_norm": 0.0, "kl": 0.18275408446788788, "learning_rate": 2.181716073437973e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9805 }, { "completion_length": 683.75, "epoch": 2.717849223946785, "grad_norm": 0.0, "kl": 897127601405952.0, "learning_rate": 2.181281849223409e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9806 }, { "completion_length": 664.25, "epoch": 2.7181263858093128, "grad_norm": 0.0, "kl": 0.1654876321554184, "learning_rate": 2.180847634782605e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9807 }, { "completion_length": 621.75, "epoch": 2.7184035476718402, "grad_norm": 0.0, "kl": 0.27605584263801575, "learning_rate": 2.1804134301288758e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9808 }, { "completion_length": 620.0, "epoch": 2.718680709534368, "grad_norm": 0.0, "kl": 0.22304750978946686, "learning_rate": 2.1799792352755365e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9809 }, { "completion_length": 739.5, "epoch": 2.7189578713968956, "grad_norm": 0.0, "kl": 0.17035692930221558, "learning_rate": 2.1795450502359044e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9810 }, { "completion_length": 625.75, "epoch": 2.7192350332594235, "grad_norm": 0.0, "kl": 0.1999121755361557, "learning_rate": 2.1791108750232915e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9811 }, { "completion_length": 729.75, "epoch": 2.7195121951219514, "grad_norm": 0.0, "kl": 0.18768218159675598, "learning_rate": 2.178676709651014e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9812 }, { "completion_length": 641.25, "epoch": 2.719789356984479, "grad_norm": 0.0, "kl": 0.25610899925231934, "learning_rate": 2.178242554132384e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9813 }, { "completion_length": 592.75, "epoch": 2.720066518847007, "grad_norm": 0.0, "kl": 4.095343112945557, "learning_rate": 2.1778084084807176e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9814 }, { "completion_length": 706.5, "epoch": 2.7203436807095343, "grad_norm": 0.0, "kl": 0.15458500385284424, "learning_rate": 2.1773742727093265e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9815 }, { "completion_length": 758.5, "epoch": 2.720620842572062, "grad_norm": 0.0, "kl": 7.451502799987793, "learning_rate": 2.176940146831524e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9816 }, { "completion_length": 581.5, "epoch": 2.7208980044345896, "grad_norm": 0.0, "kl": 24.63617706298828, "learning_rate": 2.1765060308606243e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9817 }, { "completion_length": 604.0, "epoch": 2.7211751662971175, "grad_norm": 0.0, "kl": 0.18741801381111145, "learning_rate": 2.1760719248099384e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9818 }, { "completion_length": 643.75, "epoch": 2.7214523281596454, "grad_norm": 0.0, "kl": 0.17995938658714294, "learning_rate": 2.1756378286927793e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9819 }, { "completion_length": 562.75, "epoch": 2.721729490022173, "grad_norm": 0.0, "kl": 0.20539134740829468, "learning_rate": 2.1752037425224577e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9820 }, { "completion_length": 669.0, "epoch": 2.722006651884701, "grad_norm": 0.0, "kl": 0.3156002163887024, "learning_rate": 2.1747696663122873e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9821 }, { "completion_length": 746.25, "epoch": 2.7222838137472283, "grad_norm": 0.0, "kl": 0.14118410646915436, "learning_rate": 2.1743356000755774e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9822 }, { "completion_length": 805.75, "epoch": 2.722560975609756, "grad_norm": 0.0, "kl": 0.15433232486248016, "learning_rate": 2.17390154382564e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9823 }, { "completion_length": 571.75, "epoch": 2.7228381374722836, "grad_norm": 0.0, "kl": 0.34409818053245544, "learning_rate": 2.1734674975757865e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9824 }, { "completion_length": 656.0, "epoch": 2.7231152993348116, "grad_norm": 0.0, "kl": 0.21245750784873962, "learning_rate": 2.173033461339326e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9825 }, { "completion_length": 669.0, "epoch": 2.7233924611973395, "grad_norm": 1.291931390762329, "kl": 2.966423749923706, "learning_rate": 2.1725994351295697e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9826 }, { "completion_length": 806.75, "epoch": 2.723669623059867, "grad_norm": 0.0, "kl": 0.13986143469810486, "learning_rate": 2.1721654189598262e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9827 }, { "completion_length": 704.5, "epoch": 2.723946784922395, "grad_norm": 0.0, "kl": 0.18039017915725708, "learning_rate": 2.171731412843406e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9828 }, { "completion_length": 601.0, "epoch": 2.7242239467849223, "grad_norm": 0.4054972231388092, "kl": 6941877257371648.0, "learning_rate": 2.171297416793617e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9829 }, { "completion_length": 592.5, "epoch": 2.72450110864745, "grad_norm": 0.0, "kl": 0.17146775126457214, "learning_rate": 2.1708634308237687e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9830 }, { "completion_length": 694.25, "epoch": 2.7247782705099777, "grad_norm": 0.0, "kl": 0.15578950941562653, "learning_rate": 2.170429454947171e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9831 }, { "completion_length": 817.5, "epoch": 2.7250554323725056, "grad_norm": 0.0, "kl": 0.1589689999818802, "learning_rate": 2.16999548917713e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9832 }, { "completion_length": 636.0, "epoch": 2.7253325942350335, "grad_norm": 0.0, "kl": 0.212014302611351, "learning_rate": 2.1695615335269554e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9833 }, { "completion_length": 615.5, "epoch": 2.725609756097561, "grad_norm": 0.0, "kl": 0.170410618185997, "learning_rate": 2.169127588009953e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9834 }, { "completion_length": 563.0, "epoch": 2.7258869179600884, "grad_norm": 0.0, "kl": 96687160.0, "learning_rate": 2.168693652639432e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9835 }, { "completion_length": 615.0, "epoch": 2.7261640798226163, "grad_norm": 0.0, "kl": 148626292736.0, "learning_rate": 2.168259727428698e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9836 }, { "completion_length": 718.5, "epoch": 2.7264412416851442, "grad_norm": 0.0, "kl": 18145936932864.0, "learning_rate": 2.1678258123910584e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9837 }, { "completion_length": 654.25, "epoch": 2.7267184035476717, "grad_norm": 0.0, "kl": 0.18717394769191742, "learning_rate": 2.167391907539819e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9838 }, { "completion_length": 617.5, "epoch": 2.7269955654101996, "grad_norm": 0.0, "kl": 0.19474461674690247, "learning_rate": 2.1669580128882855e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9839 }, { "completion_length": 573.5, "epoch": 2.7272727272727275, "grad_norm": 3.6033480167388916, "kl": 1467136640.0, "learning_rate": 2.166524128449766e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9840 }, { "completion_length": 720.25, "epoch": 2.727549889135255, "grad_norm": 0.0, "kl": 0.2384662926197052, "learning_rate": 2.1660902542375626e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9841 }, { "completion_length": 608.0, "epoch": 2.7278270509977824, "grad_norm": 0.0, "kl": 7.300070762634277, "learning_rate": 2.165656390264984e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9842 }, { "completion_length": 641.25, "epoch": 2.7281042128603104, "grad_norm": 0.0, "kl": 0.41730132699012756, "learning_rate": 2.165222536545331e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9843 }, { "completion_length": 598.5, "epoch": 2.7283813747228383, "grad_norm": 0.0, "kl": 0.24496997892856598, "learning_rate": 2.164788693091911e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9844 }, { "completion_length": 606.0, "epoch": 2.7286585365853657, "grad_norm": 0.0, "kl": 0.23179246485233307, "learning_rate": 2.1643548599180277e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9845 }, { "completion_length": 664.5, "epoch": 2.7289356984478936, "grad_norm": 0.0, "kl": 0.1720484346151352, "learning_rate": 2.163921037036984e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9846 }, { "completion_length": 607.75, "epoch": 2.7292128603104215, "grad_norm": 0.0, "kl": 0.18737414479255676, "learning_rate": 2.163487224462085e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9847 }, { "completion_length": 611.0, "epoch": 2.729490022172949, "grad_norm": 0.0, "kl": 0.18113572895526886, "learning_rate": 2.163053422206632e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9848 }, { "completion_length": 664.75, "epoch": 2.7297671840354765, "grad_norm": 0.0, "kl": 0.1740877479314804, "learning_rate": 2.1626196302839294e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9849 }, { "completion_length": 665.75, "epoch": 2.7300443458980044, "grad_norm": 0.33277806639671326, "kl": 673720768.0, "learning_rate": 2.162185848707279e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9850 }, { "completion_length": 659.5, "epoch": 2.7303215077605323, "grad_norm": 0.0, "kl": 2395053.5, "learning_rate": 2.1617520774899835e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9851 }, { "completion_length": 598.5, "epoch": 2.7305986696230597, "grad_norm": 0.0, "kl": 0.1880893111228943, "learning_rate": 2.161318316645344e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9852 }, { "completion_length": 583.0, "epoch": 2.7308758314855877, "grad_norm": 0.44315090775489807, "kl": 0.18972696363925934, "learning_rate": 2.1608845661866632e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9853 }, { "completion_length": 643.5, "epoch": 2.7311529933481156, "grad_norm": 0.0, "kl": 0.20541690289974213, "learning_rate": 2.1604508261272426e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9854 }, { "completion_length": 666.0, "epoch": 2.731430155210643, "grad_norm": 0.0, "kl": 0.2073081135749817, "learning_rate": 2.1600170964803823e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9855 }, { "completion_length": 566.0, "epoch": 2.7317073170731705, "grad_norm": 0.0, "kl": 0.18757428228855133, "learning_rate": 2.159583377259384e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9856 }, { "completion_length": 576.0, "epoch": 2.7319844789356984, "grad_norm": 0.0, "kl": 0.31298744678497314, "learning_rate": 2.1591496684775466e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9857 }, { "completion_length": 647.75, "epoch": 2.7322616407982263, "grad_norm": 0.0, "kl": 0.1790662556886673, "learning_rate": 2.1587159701481718e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9858 }, { "completion_length": 595.25, "epoch": 2.7325388026607538, "grad_norm": 0.0, "kl": 0.16997204720973969, "learning_rate": 2.158282282284558e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9859 }, { "completion_length": 633.25, "epoch": 2.7328159645232817, "grad_norm": 0.0, "kl": 0.16696280241012573, "learning_rate": 2.1578486049000043e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9860 }, { "completion_length": 604.25, "epoch": 2.733093126385809, "grad_norm": 0.0, "kl": 0.18653224408626556, "learning_rate": 2.157414938007812e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9861 }, { "completion_length": 601.5, "epoch": 2.733370288248337, "grad_norm": 0.0, "kl": 3.501964569091797, "learning_rate": 2.156981281621277e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9862 }, { "completion_length": 599.5, "epoch": 2.7336474501108645, "grad_norm": 0.0, "kl": 0.22827373445034027, "learning_rate": 2.1565476357537e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9863 }, { "completion_length": 525.5, "epoch": 2.7339246119733924, "grad_norm": 0.0, "kl": 0.18192701041698456, "learning_rate": 2.1561140004183783e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9864 }, { "completion_length": 597.75, "epoch": 2.7342017738359203, "grad_norm": 0.0, "kl": 0.41930004954338074, "learning_rate": 2.15568037562861e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9865 }, { "completion_length": 578.75, "epoch": 2.734478935698448, "grad_norm": 0.0, "kl": 0.1758892685174942, "learning_rate": 2.1552467613976915e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9866 }, { "completion_length": 585.5, "epoch": 2.7347560975609757, "grad_norm": 0.0, "kl": 0.2750791609287262, "learning_rate": 2.1548131577389213e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9867 }, { "completion_length": 619.75, "epoch": 2.735033259423503, "grad_norm": 0.0, "kl": 0.1929425448179245, "learning_rate": 2.154379564665595e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9868 }, { "completion_length": 736.75, "epoch": 2.735310421286031, "grad_norm": 0.0, "kl": 0.15715278685092926, "learning_rate": 2.15394598219101e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9869 }, { "completion_length": 606.75, "epoch": 2.7355875831485585, "grad_norm": 0.0, "kl": 0.20973344147205353, "learning_rate": 2.1535124103284626e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9870 }, { "completion_length": 622.25, "epoch": 2.7358647450110865, "grad_norm": 0.0, "kl": 0.3309064507484436, "learning_rate": 2.153078849091247e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9871 }, { "completion_length": 671.25, "epoch": 2.7361419068736144, "grad_norm": 0.0, "kl": 0.21240629255771637, "learning_rate": 2.152645298492662e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9872 }, { "completion_length": 546.0, "epoch": 2.736419068736142, "grad_norm": 0.0, "kl": 0.5073580741882324, "learning_rate": 2.1522117585459987e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9873 }, { "completion_length": 617.0, "epoch": 2.7366962305986697, "grad_norm": 0.0, "kl": 2062569344.0, "learning_rate": 2.1517782292645555e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9874 }, { "completion_length": 689.25, "epoch": 2.736973392461197, "grad_norm": 0.0, "kl": 4.427074477070746e+16, "learning_rate": 2.151344710661624e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9875 }, { "completion_length": 663.0, "epoch": 2.737250554323725, "grad_norm": 0.0, "kl": 0.17845968902111053, "learning_rate": 2.1509112027505006e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9876 }, { "completion_length": 663.5, "epoch": 2.7375277161862526, "grad_norm": 0.0, "kl": 0.19666466116905212, "learning_rate": 2.1504777055444786e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9877 }, { "completion_length": 752.5, "epoch": 2.7378048780487805, "grad_norm": 0.0, "kl": 0.14392587542533875, "learning_rate": 2.1500442190568513e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9878 }, { "completion_length": 532.5, "epoch": 2.7380820399113084, "grad_norm": 0.0, "kl": 78.76061248779297, "learning_rate": 2.149610743300912e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9879 }, { "completion_length": 685.25, "epoch": 2.738359201773836, "grad_norm": 0.0, "kl": 0.20984336733818054, "learning_rate": 2.149177278289954e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9880 }, { "completion_length": 672.0, "epoch": 2.7386363636363638, "grad_norm": 0.0, "kl": 0.23152989149093628, "learning_rate": 2.148743824037269e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9881 }, { "completion_length": 545.0, "epoch": 2.738913525498891, "grad_norm": 0.0, "kl": 1.618804693222046, "learning_rate": 2.1483103805561493e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9882 }, { "completion_length": 619.25, "epoch": 2.739190687361419, "grad_norm": 0.0, "kl": 0.19218379259109497, "learning_rate": 2.1478769478598872e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9883 }, { "completion_length": 569.75, "epoch": 2.7394678492239466, "grad_norm": 0.0, "kl": 0.2474462389945984, "learning_rate": 2.147443525961775e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9884 }, { "completion_length": 688.75, "epoch": 2.7397450110864745, "grad_norm": 0.0, "kl": 0.21080006659030914, "learning_rate": 2.147010114875103e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9885 }, { "completion_length": 612.0, "epoch": 2.7400221729490024, "grad_norm": 0.0, "kl": 9.611245155334473, "learning_rate": 2.1465767146131633e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9886 }, { "completion_length": 681.25, "epoch": 2.74029933481153, "grad_norm": 0.0, "kl": 0.17330220341682434, "learning_rate": 2.1461433251892445e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9887 }, { "completion_length": 653.5, "epoch": 2.740576496674058, "grad_norm": 0.0, "kl": 0.18827751278877258, "learning_rate": 2.1457099466166382e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9888 }, { "completion_length": 647.25, "epoch": 2.7408536585365852, "grad_norm": 0.0, "kl": 0.16829529404640198, "learning_rate": 2.1452765789086342e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9889 }, { "completion_length": 603.0, "epoch": 2.741130820399113, "grad_norm": 5.468051910400391, "kl": 668200811364352.0, "learning_rate": 2.144843222078521e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9890 }, { "completion_length": 566.75, "epoch": 2.7414079822616406, "grad_norm": 0.0, "kl": 1.2997627599650816e+16, "learning_rate": 2.14440987613959e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9891 }, { "completion_length": 542.5, "epoch": 2.7416851441241685, "grad_norm": 0.0, "kl": 0.22488880157470703, "learning_rate": 2.143976541105128e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9892 }, { "completion_length": 598.0, "epoch": 2.7419623059866964, "grad_norm": 0.0, "kl": 0.2180129736661911, "learning_rate": 2.1435432169884256e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9893 }, { "completion_length": 590.0, "epoch": 2.742239467849224, "grad_norm": 0.0, "kl": 0.20833344757556915, "learning_rate": 2.143109903802768e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9894 }, { "completion_length": 581.5, "epoch": 2.742516629711752, "grad_norm": 1.367085337638855, "kl": 405001188933632.0, "learning_rate": 2.142676601561447e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9895 }, { "completion_length": 587.5, "epoch": 2.7427937915742793, "grad_norm": 0.0, "kl": 0.2550238072872162, "learning_rate": 2.142243310277747e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9896 }, { "completion_length": 659.75, "epoch": 2.743070953436807, "grad_norm": 0.0, "kl": 0.22102786600589752, "learning_rate": 2.1418100299649563e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9897 }, { "completion_length": 590.25, "epoch": 2.7433481152993346, "grad_norm": 0.0, "kl": 0.1698899269104004, "learning_rate": 2.1413767606363626e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9898 }, { "completion_length": 672.0, "epoch": 2.7436252771618626, "grad_norm": 0.0, "kl": 0.2501981556415558, "learning_rate": 2.1409435023052518e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9899 }, { "completion_length": 620.75, "epoch": 2.7439024390243905, "grad_norm": 0.0, "kl": 0.4987376034259796, "learning_rate": 2.1405102549849107e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9900 }, { "completion_length": 581.0, "epoch": 2.744179600886918, "grad_norm": 0.0, "kl": 0.2863509953022003, "learning_rate": 2.140077018688623e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9901 }, { "completion_length": 592.5, "epoch": 2.744456762749446, "grad_norm": 0.0, "kl": 0.1602102518081665, "learning_rate": 2.1396437934296775e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9902 }, { "completion_length": 613.0, "epoch": 2.7447339246119733, "grad_norm": 0.0, "kl": 0.21985605359077454, "learning_rate": 2.1392105792213565e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9903 }, { "completion_length": 615.0, "epoch": 2.745011086474501, "grad_norm": 0.0, "kl": 0.217572420835495, "learning_rate": 2.1387773760769477e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9904 }, { "completion_length": 744.0, "epoch": 2.7452882483370287, "grad_norm": 0.0, "kl": 0.1713595986366272, "learning_rate": 2.1383441840097325e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9905 }, { "completion_length": 535.5, "epoch": 2.7455654101995566, "grad_norm": 0.9207650423049927, "kl": 3555486720.0, "learning_rate": 2.137911003032997e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9906 }, { "completion_length": 614.5, "epoch": 2.7458425720620845, "grad_norm": 0.0, "kl": 3021688807620608.0, "learning_rate": 2.137477833160026e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9907 }, { "completion_length": 660.25, "epoch": 2.746119733924612, "grad_norm": 0.0, "kl": 3.0423617362976074, "learning_rate": 2.137044674404101e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9908 }, { "completion_length": 606.0, "epoch": 2.7463968957871394, "grad_norm": 0.4338701665401459, "kl": 8676591616.0, "learning_rate": 2.1366115267785064e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9909 }, { "completion_length": 704.5, "epoch": 2.7466740576496673, "grad_norm": 0.0, "kl": 0.1623930037021637, "learning_rate": 2.136178390296524e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9910 }, { "completion_length": 558.5, "epoch": 2.7469512195121952, "grad_norm": 0.0, "kl": 0.1929602175951004, "learning_rate": 2.135745264971437e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9911 }, { "completion_length": 616.75, "epoch": 2.7472283813747227, "grad_norm": 0.0, "kl": 0.21964064240455627, "learning_rate": 2.135312150816528e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9912 }, { "completion_length": 593.0, "epoch": 2.7475055432372506, "grad_norm": 2.716132402420044, "kl": 3528822439804928.0, "learning_rate": 2.134879047845077e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9913 }, { "completion_length": 719.75, "epoch": 2.7477827050997785, "grad_norm": 0.0, "kl": 0.1978306919336319, "learning_rate": 2.134445956070368e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9914 }, { "completion_length": 634.5, "epoch": 2.748059866962306, "grad_norm": 0.0, "kl": 0.42297449707984924, "learning_rate": 2.1340128755056804e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9915 }, { "completion_length": 530.0, "epoch": 2.7483370288248334, "grad_norm": 0.0, "kl": 0.20375703275203705, "learning_rate": 2.1335798061642956e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9916 }, { "completion_length": 749.25, "epoch": 2.7486141906873613, "grad_norm": 0.3464076817035675, "kl": 0.163777694106102, "learning_rate": 2.1331467480594936e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9917 }, { "completion_length": 528.25, "epoch": 2.7488913525498893, "grad_norm": 0.0, "kl": 7.222821590872883e+16, "learning_rate": 2.132713701204556e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9918 }, { "completion_length": 795.75, "epoch": 2.7491685144124167, "grad_norm": 0.0, "kl": 0.14795532822608948, "learning_rate": 2.1322806656127597e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9919 }, { "completion_length": 573.0, "epoch": 2.7494456762749446, "grad_norm": 0.0, "kl": 0.396849125623703, "learning_rate": 2.131847641297386e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9920 }, { "completion_length": 555.5, "epoch": 2.7497228381374725, "grad_norm": 0.0, "kl": 0.1906818449497223, "learning_rate": 2.1314146282717144e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9921 }, { "completion_length": 598.5, "epoch": 2.75, "grad_norm": 0.0, "kl": 0.24271228909492493, "learning_rate": 2.1309816265490214e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9922 }, { "completion_length": 575.75, "epoch": 2.7502771618625275, "grad_norm": 0.0, "kl": 0.18653184175491333, "learning_rate": 2.130548636142589e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9923 }, { "completion_length": 608.5, "epoch": 2.7505543237250554, "grad_norm": 0.0, "kl": 0.2022586464881897, "learning_rate": 2.1301156570656913e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9924 }, { "completion_length": 724.0, "epoch": 2.7508314855875833, "grad_norm": 0.0, "kl": 0.1603301465511322, "learning_rate": 2.1296826893316086e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9925 }, { "completion_length": 561.5, "epoch": 2.7511086474501107, "grad_norm": 0.48874548077583313, "kl": 1.7170010086572032e+16, "learning_rate": 2.1292497329536167e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9926 }, { "completion_length": 651.0, "epoch": 2.7513858093126387, "grad_norm": 0.0, "kl": 0.16779087483882904, "learning_rate": 2.1288167879449934e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9927 }, { "completion_length": 570.25, "epoch": 2.7516629711751666, "grad_norm": 0.0, "kl": 0.2724553048610687, "learning_rate": 2.128383854319016e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9928 }, { "completion_length": 603.25, "epoch": 2.751940133037694, "grad_norm": 0.0, "kl": 0.16863256692886353, "learning_rate": 2.127950932088959e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9929 }, { "completion_length": 635.5, "epoch": 2.7522172949002215, "grad_norm": 0.0, "kl": 0.2631780505180359, "learning_rate": 2.1275180212680995e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9930 }, { "completion_length": 633.0, "epoch": 2.7524944567627494, "grad_norm": 0.0, "kl": 0.15483388304710388, "learning_rate": 2.1270851218697127e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9931 }, { "completion_length": 554.25, "epoch": 2.7527716186252773, "grad_norm": 0.0, "kl": 0.2293548732995987, "learning_rate": 2.1266522339070746e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9932 }, { "completion_length": 592.75, "epoch": 2.7530487804878048, "grad_norm": 0.0, "kl": 0.1929192990064621, "learning_rate": 2.1262193573934577e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9933 }, { "completion_length": 609.25, "epoch": 2.7533259423503327, "grad_norm": 0.0, "kl": 0.3564806580543518, "learning_rate": 2.1257864923421405e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9934 }, { "completion_length": 618.0, "epoch": 2.7536031042128606, "grad_norm": 0.0, "kl": 0.1973428577184677, "learning_rate": 2.1253536387663926e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9935 }, { "completion_length": 599.0, "epoch": 2.753880266075388, "grad_norm": 1.8046848773956299, "kl": 15760.6572265625, "learning_rate": 2.124920796679491e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9936 }, { "completion_length": 715.25, "epoch": 2.7541574279379155, "grad_norm": 0.0, "kl": 2.7656590938568115, "learning_rate": 2.124487966094709e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9937 }, { "completion_length": 646.5, "epoch": 2.7544345898004434, "grad_norm": 0.0, "kl": 0.21672658622264862, "learning_rate": 2.124055147025318e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9938 }, { "completion_length": 606.25, "epoch": 2.7547117516629713, "grad_norm": 0.0, "kl": 0.19064608216285706, "learning_rate": 2.1236223394845928e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9939 }, { "completion_length": 554.0, "epoch": 2.754988913525499, "grad_norm": 0.0, "kl": 0.19258299469947815, "learning_rate": 2.123189543485804e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9940 }, { "completion_length": 617.25, "epoch": 2.7552660753880267, "grad_norm": 0.0, "kl": 0.1832873523235321, "learning_rate": 2.1227567590422248e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9941 }, { "completion_length": 593.0, "epoch": 2.755543237250554, "grad_norm": 0.0, "kl": 0.2235681414604187, "learning_rate": 2.122323986167126e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9942 }, { "completion_length": 596.0, "epoch": 2.755820399113082, "grad_norm": 0.0, "kl": 0.18955637514591217, "learning_rate": 2.121891224873779e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9943 }, { "completion_length": 631.25, "epoch": 2.7560975609756095, "grad_norm": 0.0, "kl": 22001643159552.0, "learning_rate": 2.1214584751754567e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9944 }, { "completion_length": 583.25, "epoch": 2.7563747228381374, "grad_norm": 0.0, "kl": 9180408.0, "learning_rate": 2.121025737085427e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9945 }, { "completion_length": 558.25, "epoch": 2.7566518847006654, "grad_norm": 0.0, "kl": 0.21481572091579437, "learning_rate": 2.1205930106169626e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9946 }, { "completion_length": 549.0, "epoch": 2.756929046563193, "grad_norm": 1.0304981470108032, "kl": 6081539.0, "learning_rate": 2.120160295783331e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9947 }, { "completion_length": 539.5, "epoch": 2.7572062084257207, "grad_norm": 0.0, "kl": 0.22286216914653778, "learning_rate": 2.1197275925978045e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9948 }, { "completion_length": 606.0, "epoch": 2.757483370288248, "grad_norm": 0.0, "kl": 0.6620931029319763, "learning_rate": 2.1192949010736497e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9949 }, { "completion_length": 545.5, "epoch": 2.757760532150776, "grad_norm": 0.0, "kl": 0.2302994579076767, "learning_rate": 2.1188622212241366e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9950 }, { "completion_length": 658.5, "epoch": 2.7580376940133036, "grad_norm": 0.0, "kl": 0.22216574847698212, "learning_rate": 2.1184295530625346e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9951 }, { "completion_length": 612.25, "epoch": 2.7583148558758315, "grad_norm": 0.0, "kl": 0.30711591243743896, "learning_rate": 2.1179968966021096e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9952 }, { "completion_length": 593.0, "epoch": 2.7585920177383594, "grad_norm": 0.0, "kl": 0.18636126816272736, "learning_rate": 2.117564251856132e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9953 }, { "completion_length": 671.5, "epoch": 2.758869179600887, "grad_norm": 0.0, "kl": 0.20946751534938812, "learning_rate": 2.117131618837867e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9954 }, { "completion_length": 663.5, "epoch": 2.7591463414634148, "grad_norm": 0.0, "kl": 0.1685495227575302, "learning_rate": 2.1166989975605834e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9955 }, { "completion_length": 641.0, "epoch": 2.759423503325942, "grad_norm": 0.0, "kl": 13.445454597473145, "learning_rate": 2.116266388037546e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9956 }, { "completion_length": 957.25, "epoch": 2.75970066518847, "grad_norm": 0.0, "kl": 14346047.0, "learning_rate": 2.1158337902820227e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9957 }, { "completion_length": 723.0, "epoch": 2.7599778270509976, "grad_norm": 0.0, "kl": 0.1487313061952591, "learning_rate": 2.1154012043072792e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9958 }, { "completion_length": 579.25, "epoch": 2.7602549889135255, "grad_norm": 0.0, "kl": 0.20526939630508423, "learning_rate": 2.11496863012658e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9959 }, { "completion_length": 728.5, "epoch": 2.7605321507760534, "grad_norm": 0.0, "kl": 8375831552.0, "learning_rate": 2.114536067753192e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9960 }, { "completion_length": 636.5, "epoch": 2.760809312638581, "grad_norm": 0.0, "kl": 0.17981509864330292, "learning_rate": 2.1141035172003794e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9961 }, { "completion_length": 676.25, "epoch": 2.761086474501109, "grad_norm": 0.0, "kl": 0.21858972311019897, "learning_rate": 2.113670978481407e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9962 }, { "completion_length": 575.5, "epoch": 2.7613636363636362, "grad_norm": 0.0, "kl": 0.2113102823495865, "learning_rate": 2.113238451609538e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9963 }, { "completion_length": 548.25, "epoch": 2.761640798226164, "grad_norm": 0.0, "kl": 0.23708990216255188, "learning_rate": 2.112805936598037e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9964 }, { "completion_length": 626.75, "epoch": 2.7619179600886916, "grad_norm": 0.0, "kl": 0.1881338208913803, "learning_rate": 2.1123734334601663e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9965 }, { "completion_length": 584.0, "epoch": 2.7621951219512195, "grad_norm": 0.0, "kl": 0.2240614891052246, "learning_rate": 2.11194094220919e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9966 }, { "completion_length": 713.25, "epoch": 2.7624722838137474, "grad_norm": 0.0, "kl": 0.22176386415958405, "learning_rate": 2.111508462858372e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9967 }, { "completion_length": 729.75, "epoch": 2.762749445676275, "grad_norm": 0.0, "kl": 46989.10546875, "learning_rate": 2.1110759954209727e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9968 }, { "completion_length": 602.25, "epoch": 2.763026607538803, "grad_norm": 0.0, "kl": 0.18686679005622864, "learning_rate": 2.110643539910255e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9969 }, { "completion_length": 658.25, "epoch": 2.7633037694013303, "grad_norm": 0.0, "kl": 0.20022548735141754, "learning_rate": 2.11021109633948e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9970 }, { "completion_length": 641.25, "epoch": 2.763580931263858, "grad_norm": 0.0, "kl": 2.822616858702643e+16, "learning_rate": 2.1097786647219095e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9971 }, { "completion_length": 669.25, "epoch": 2.7638580931263856, "grad_norm": 0.0, "kl": 0.16112473607063293, "learning_rate": 2.1093462450708037e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9972 }, { "completion_length": 530.5, "epoch": 2.7641352549889135, "grad_norm": 0.0, "kl": 0.2475530207157135, "learning_rate": 2.1089138373994226e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9973 }, { "completion_length": 602.25, "epoch": 2.7644124168514415, "grad_norm": 0.0, "kl": 0.18218651413917542, "learning_rate": 2.1084814417210293e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9974 }, { "completion_length": 605.0, "epoch": 2.764689578713969, "grad_norm": 0.0, "kl": 0.18816359341144562, "learning_rate": 2.1080490580488796e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9975 }, { "completion_length": 601.5, "epoch": 2.764966740576497, "grad_norm": 0.0, "kl": 0.197356715798378, "learning_rate": 2.1076166863962358e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9976 }, { "completion_length": 618.0, "epoch": 2.7652439024390243, "grad_norm": 0.0, "kl": 0.18879646062850952, "learning_rate": 2.1071843267763557e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9977 }, { "completion_length": 601.75, "epoch": 2.765521064301552, "grad_norm": 0.0, "kl": 0.21427439153194427, "learning_rate": 2.1067519792024988e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9978 }, { "completion_length": 626.75, "epoch": 2.7657982261640797, "grad_norm": 3.9640660285949707, "kl": 1191737984.0, "learning_rate": 2.106319643687922e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9979 }, { "completion_length": 640.5, "epoch": 2.7660753880266076, "grad_norm": 0.7365825772285461, "kl": 17398514.0, "learning_rate": 2.105887320245884e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9980 }, { "completion_length": 594.5, "epoch": 2.7663525498891355, "grad_norm": 0.0, "kl": 536199200.0, "learning_rate": 2.105455008889643e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9981 }, { "completion_length": 706.0, "epoch": 2.766629711751663, "grad_norm": 0.0, "kl": 0.16400237381458282, "learning_rate": 2.1050227096324556e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9982 }, { "completion_length": 582.0, "epoch": 2.766906873614191, "grad_norm": 0.0, "kl": 0.22450707852840424, "learning_rate": 2.1045904224875788e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9983 }, { "completion_length": 648.0, "epoch": 2.7671840354767183, "grad_norm": 0.0, "kl": 0.21171505749225616, "learning_rate": 2.1041581474682678e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9984 }, { "completion_length": 602.75, "epoch": 2.7674611973392462, "grad_norm": 0.0, "kl": 0.2418314665555954, "learning_rate": 2.1037258845877813e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9985 }, { "completion_length": 594.75, "epoch": 2.7677383592017737, "grad_norm": 0.0, "kl": 0.2042640596628189, "learning_rate": 2.1032936338593716e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9986 }, { "completion_length": 598.25, "epoch": 2.7680155210643016, "grad_norm": 0.0, "kl": 0.17931663990020752, "learning_rate": 2.102861395296297e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9987 }, { "completion_length": 638.75, "epoch": 2.7682926829268295, "grad_norm": 0.0, "kl": 700487744.0, "learning_rate": 2.102429168911812e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9988 }, { "completion_length": 622.5, "epoch": 2.768569844789357, "grad_norm": 0.0, "kl": 0.20653486251831055, "learning_rate": 2.1019969547191692e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9989 }, { "completion_length": 602.75, "epoch": 2.7688470066518844, "grad_norm": 0.0, "kl": 0.2005297839641571, "learning_rate": 2.101564752731625e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9990 }, { "completion_length": 651.5, "epoch": 2.7691241685144123, "grad_norm": 0.0, "kl": 0.17671038210391998, "learning_rate": 2.101132562962432e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9991 }, { "completion_length": 634.0, "epoch": 2.7694013303769403, "grad_norm": 0.0, "kl": 0.20611357688903809, "learning_rate": 2.1007003854248447e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9992 }, { "completion_length": 532.75, "epoch": 2.7696784922394677, "grad_norm": 0.0, "kl": 15931.861328125, "learning_rate": 2.1002682201321146e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9993 }, { "completion_length": 662.75, "epoch": 2.7699556541019956, "grad_norm": 0.0, "kl": 0.3334946930408478, "learning_rate": 2.0998360670974955e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9994 }, { "completion_length": 637.25, "epoch": 2.7702328159645235, "grad_norm": 0.0, "kl": 0.20660321414470673, "learning_rate": 2.09940392633424e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9995 }, { "completion_length": 583.75, "epoch": 2.770509977827051, "grad_norm": 0.0, "kl": 0.2205406129360199, "learning_rate": 2.0989717978555992e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9996 }, { "completion_length": 617.5, "epoch": 2.7707871396895785, "grad_norm": 0.0, "kl": 92584.5859375, "learning_rate": 2.0985396816748262e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9997 }, { "completion_length": 656.0, "epoch": 2.7710643015521064, "grad_norm": 0.0, "kl": 0.26136675477027893, "learning_rate": 2.0981075778051698e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9998 }, { "completion_length": 596.5, "epoch": 2.7713414634146343, "grad_norm": 0.0, "kl": 181.92271423339844, "learning_rate": 2.097675486259883e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 9999 }, { "completion_length": 603.25, "epoch": 2.7716186252771617, "grad_norm": 0.0, "kl": 0.1951555609703064, "learning_rate": 2.0972434070522154e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10000 }, { "completion_length": 666.75, "epoch": 2.7718957871396896, "grad_norm": 0.0, "kl": 0.19437360763549805, "learning_rate": 2.096811340195417e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10001 }, { "completion_length": 664.25, "epoch": 2.7721729490022176, "grad_norm": 0.0, "kl": 0.48917579650878906, "learning_rate": 2.096379285702737e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10002 }, { "completion_length": 675.5, "epoch": 2.772450110864745, "grad_norm": 0.0, "kl": 0.19541946053504944, "learning_rate": 2.095947243587425e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10003 }, { "completion_length": 642.25, "epoch": 2.7727272727272725, "grad_norm": 0.0, "kl": 0.23410463333129883, "learning_rate": 2.0955152138627316e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10004 }, { "completion_length": 627.25, "epoch": 2.7730044345898004, "grad_norm": 0.0, "kl": 0.2017362415790558, "learning_rate": 2.0950831965419026e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10005 }, { "completion_length": 607.0, "epoch": 2.7732815964523283, "grad_norm": 0.0, "kl": 0.20637214183807373, "learning_rate": 2.094651191638189e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10006 }, { "completion_length": 598.5, "epoch": 2.7735587583148558, "grad_norm": 0.4523630142211914, "kl": 0.2048424482345581, "learning_rate": 2.0942191991648357e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10007 }, { "completion_length": 634.0, "epoch": 2.7738359201773837, "grad_norm": 0.0, "kl": 27.164329528808594, "learning_rate": 2.093787219135093e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10008 }, { "completion_length": 625.75, "epoch": 2.7741130820399116, "grad_norm": 0.0, "kl": 0.18545806407928467, "learning_rate": 2.093355251562205e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10009 }, { "completion_length": 596.0, "epoch": 2.774390243902439, "grad_norm": 0.0, "kl": 0.2585362493991852, "learning_rate": 2.09292329645942e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10010 }, { "completion_length": 593.0, "epoch": 2.7746674057649665, "grad_norm": 1.43728768825531, "kl": 291976540848128.0, "learning_rate": 2.0924913538399854e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10011 }, { "completion_length": 619.0, "epoch": 2.7749445676274944, "grad_norm": 0.0, "kl": 234895.046875, "learning_rate": 2.092059423717145e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10012 }, { "completion_length": 606.0, "epoch": 2.7752217294900223, "grad_norm": 0.0, "kl": 0.19432245194911957, "learning_rate": 2.0916275061041453e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10013 }, { "completion_length": 654.25, "epoch": 2.77549889135255, "grad_norm": 0.4312937259674072, "kl": 0.3463698923587799, "learning_rate": 2.091195601014231e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10014 }, { "completion_length": 708.25, "epoch": 2.7757760532150777, "grad_norm": 0.0, "kl": 0.20372579991817474, "learning_rate": 2.090763708460647e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10015 }, { "completion_length": 611.25, "epoch": 2.776053215077605, "grad_norm": 0.0, "kl": 0.21024644374847412, "learning_rate": 2.090331828456637e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10016 }, { "completion_length": 708.25, "epoch": 2.776330376940133, "grad_norm": 0.0, "kl": 0.7701776623725891, "learning_rate": 2.0898999610154463e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10017 }, { "completion_length": 616.75, "epoch": 2.7766075388026605, "grad_norm": 0.0, "kl": 0.20922428369522095, "learning_rate": 2.0894681061503185e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10018 }, { "completion_length": 691.5, "epoch": 2.7768847006651884, "grad_norm": 0.0, "kl": 0.21483056247234344, "learning_rate": 2.089036263874495e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10019 }, { "completion_length": 545.25, "epoch": 2.7771618625277164, "grad_norm": 0.0, "kl": 0.22199349105358124, "learning_rate": 2.0886044342012207e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10020 }, { "completion_length": 701.75, "epoch": 2.777439024390244, "grad_norm": 0.0, "kl": 0.17581944167613983, "learning_rate": 2.0881726171437363e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10021 }, { "completion_length": 605.5, "epoch": 2.7777161862527717, "grad_norm": 0.0, "kl": 0.2071891874074936, "learning_rate": 2.087740812715285e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10022 }, { "completion_length": 498.75, "epoch": 2.777993348115299, "grad_norm": 0.0, "kl": 0.7778609991073608, "learning_rate": 2.087309020929108e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10023 }, { "completion_length": 583.0, "epoch": 2.778270509977827, "grad_norm": 0.0, "kl": 0.2901279032230377, "learning_rate": 2.0868772417984457e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10024 }, { "completion_length": 501.25, "epoch": 2.7785476718403546, "grad_norm": 0.0, "kl": 300282528.0, "learning_rate": 2.0864454753365414e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10025 }, { "completion_length": 558.5, "epoch": 2.7788248337028825, "grad_norm": 0.0, "kl": 0.27133846282958984, "learning_rate": 2.0860137215566325e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10026 }, { "completion_length": 603.25, "epoch": 2.7791019955654104, "grad_norm": 0.0, "kl": 0.1958979368209839, "learning_rate": 2.0855819804719615e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10027 }, { "completion_length": 698.75, "epoch": 2.779379157427938, "grad_norm": 0.0, "kl": 0.6458902359008789, "learning_rate": 2.0851502520957667e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10028 }, { "completion_length": 695.25, "epoch": 2.7796563192904657, "grad_norm": 0.0, "kl": 0.18187958002090454, "learning_rate": 2.0847185364412885e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10029 }, { "completion_length": 588.75, "epoch": 2.779933481152993, "grad_norm": 0.0, "kl": 0.18113581836223602, "learning_rate": 2.084286833521765e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10030 }, { "completion_length": 562.5, "epoch": 2.780210643015521, "grad_norm": 0.0, "kl": 0.2265770584344864, "learning_rate": 2.0838551433504354e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10031 }, { "completion_length": 554.0, "epoch": 2.7804878048780486, "grad_norm": 0.0, "kl": 1.7006317377090454, "learning_rate": 2.083423465940537e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10032 }, { "completion_length": 663.0, "epoch": 2.7807649667405765, "grad_norm": 0.0, "kl": 0.1385386735200882, "learning_rate": 2.0829918013053075e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10033 }, { "completion_length": 589.5, "epoch": 2.7810421286031044, "grad_norm": 0.41657590866088867, "kl": 0.2057252824306488, "learning_rate": 2.082560149457986e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10034 }, { "completion_length": 551.75, "epoch": 2.781319290465632, "grad_norm": 0.0, "kl": 0.22222112119197845, "learning_rate": 2.0821285104118065e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10035 }, { "completion_length": 517.25, "epoch": 2.7815964523281598, "grad_norm": 0.0, "kl": 0.3086934983730316, "learning_rate": 2.0816968841800094e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10036 }, { "completion_length": 704.0, "epoch": 2.7818736141906872, "grad_norm": 0.0, "kl": 2.305490255355835, "learning_rate": 2.081265270775827e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10037 }, { "completion_length": 743.75, "epoch": 2.782150776053215, "grad_norm": 0.0, "kl": 0.2185838371515274, "learning_rate": 2.080833670212498e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10038 }, { "completion_length": 626.0, "epoch": 2.7824279379157426, "grad_norm": 0.0, "kl": 0.1844746619462967, "learning_rate": 2.0804020825032555e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10039 }, { "completion_length": 634.5, "epoch": 2.7827050997782705, "grad_norm": 0.0, "kl": 0.24019896984100342, "learning_rate": 2.0799705076613365e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10040 }, { "completion_length": 539.25, "epoch": 2.7829822616407984, "grad_norm": 0.0, "kl": 2.495826482772827, "learning_rate": 2.0795389456999744e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10041 }, { "completion_length": 598.75, "epoch": 2.783259423503326, "grad_norm": 0.0, "kl": 0.24942758679389954, "learning_rate": 2.079107396632404e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10042 }, { "completion_length": 701.25, "epoch": 2.783536585365854, "grad_norm": 0.0, "kl": 0.18604502081871033, "learning_rate": 2.078675860471859e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10043 }, { "completion_length": 541.5, "epoch": 2.7838137472283813, "grad_norm": 0.0, "kl": 5765636038852608.0, "learning_rate": 2.078244337231572e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10044 }, { "completion_length": 594.0, "epoch": 2.784090909090909, "grad_norm": 0.0, "kl": 0.3168553113937378, "learning_rate": 2.077812826924778e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10045 }, { "completion_length": 691.5, "epoch": 2.7843680709534366, "grad_norm": 0.0, "kl": 0.17393730580806732, "learning_rate": 2.0773813295647064e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10046 }, { "completion_length": 631.5, "epoch": 2.7846452328159645, "grad_norm": 0.0, "kl": 0.24640357494354248, "learning_rate": 2.0769498451645927e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10047 }, { "completion_length": 520.75, "epoch": 2.7849223946784925, "grad_norm": 0.0, "kl": 869089344.0, "learning_rate": 2.0765183737376674e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10048 }, { "completion_length": 581.75, "epoch": 2.78519955654102, "grad_norm": 0.0, "kl": 0.2350654900074005, "learning_rate": 2.076086915297161e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10049 }, { "completion_length": 526.75, "epoch": 2.785476718403548, "grad_norm": 0.0, "kl": 0.3829571008682251, "learning_rate": 2.075655469856307e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10050 }, { "completion_length": 541.75, "epoch": 2.7857538802660753, "grad_norm": 0.0, "kl": 7.158230866198528e+16, "learning_rate": 2.0752240374283334e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10051 }, { "completion_length": 669.0, "epoch": 2.786031042128603, "grad_norm": 0.0, "kl": 0.22882844507694244, "learning_rate": 2.0747926180264725e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10052 }, { "completion_length": 650.0, "epoch": 2.7863082039911307, "grad_norm": 0.0, "kl": 0.1840040236711502, "learning_rate": 2.0743612116639523e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10053 }, { "completion_length": 538.0, "epoch": 2.7865853658536586, "grad_norm": 0.0, "kl": 0.19996121525764465, "learning_rate": 2.0739298183540027e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10054 }, { "completion_length": 632.5, "epoch": 2.7868625277161865, "grad_norm": 0.0, "kl": 0.19328728318214417, "learning_rate": 2.073498438109855e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10055 }, { "completion_length": 565.0, "epoch": 2.787139689578714, "grad_norm": 0.0, "kl": 0.22694560885429382, "learning_rate": 2.073067070944734e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10056 }, { "completion_length": 476.75, "epoch": 2.787416851441242, "grad_norm": 0.5537981986999512, "kl": 2065758848.0, "learning_rate": 2.072635716871871e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10057 }, { "completion_length": 618.75, "epoch": 2.7876940133037693, "grad_norm": 0.0, "kl": 0.21216952800750732, "learning_rate": 2.0722043759044924e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10058 }, { "completion_length": 614.25, "epoch": 2.787971175166297, "grad_norm": 0.0, "kl": 0.20215487480163574, "learning_rate": 2.071773048055826e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10059 }, { "completion_length": 594.25, "epoch": 2.7882483370288247, "grad_norm": 0.0, "kl": 0.22153240442276, "learning_rate": 2.071341733339099e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10060 }, { "completion_length": 563.5, "epoch": 2.7885254988913526, "grad_norm": 0.0, "kl": 0.1939733922481537, "learning_rate": 2.0709104317675383e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10061 }, { "completion_length": 463.25, "epoch": 2.7888026607538805, "grad_norm": 0.0, "kl": 0.25282156467437744, "learning_rate": 2.070479143354369e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10062 }, { "completion_length": 665.25, "epoch": 2.789079822616408, "grad_norm": 0.0, "kl": 0.256096214056015, "learning_rate": 2.0700478681128177e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10063 }, { "completion_length": 659.75, "epoch": 2.7893569844789354, "grad_norm": 0.0, "kl": 0.16145841777324677, "learning_rate": 2.0696166060561104e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10064 }, { "completion_length": 603.5, "epoch": 2.7896341463414633, "grad_norm": 0.40159517526626587, "kl": 2.3454766581835366e+17, "learning_rate": 2.0691853571974707e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10065 }, { "completion_length": 558.25, "epoch": 2.7899113082039912, "grad_norm": 0.0, "kl": 0.41297125816345215, "learning_rate": 2.0687541215501245e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10066 }, { "completion_length": 612.5, "epoch": 2.7901884700665187, "grad_norm": 0.0, "kl": 0.20091061294078827, "learning_rate": 2.0683228991272946e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10067 }, { "completion_length": 529.5, "epoch": 2.7904656319290466, "grad_norm": 0.0, "kl": 0.183590367436409, "learning_rate": 2.067891689942207e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10068 }, { "completion_length": 815.75, "epoch": 2.7907427937915745, "grad_norm": 1.2812306880950928, "kl": 4051.623779296875, "learning_rate": 2.067460494008082e-06, "loss": -0.0, "reward": 1.71875, "reward_std": 0.0625, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.71875, "step": 10069 }, { "completion_length": 626.75, "epoch": 2.791019955654102, "grad_norm": 0.962535560131073, "kl": 131570848.0, "learning_rate": 2.0670293113381453e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10070 }, { "completion_length": 522.25, "epoch": 2.7912971175166295, "grad_norm": 0.0, "kl": 0.19874581694602966, "learning_rate": 2.066598141945619e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10071 }, { "completion_length": 656.25, "epoch": 2.7915742793791574, "grad_norm": 0.0, "kl": 0.19069604575634003, "learning_rate": 2.066166985843724e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10072 }, { "completion_length": 568.25, "epoch": 2.7918514412416853, "grad_norm": 0.39556941390037537, "kl": 1.3360173520047309e+17, "learning_rate": 2.065735843045683e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10073 }, { "completion_length": 559.75, "epoch": 2.7921286031042127, "grad_norm": 0.0, "kl": 0.2726527750492096, "learning_rate": 2.0653047135647173e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10074 }, { "completion_length": 609.75, "epoch": 2.7924057649667406, "grad_norm": 0.0, "kl": 0.23946762084960938, "learning_rate": 2.064873597414048e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10075 }, { "completion_length": 716.0, "epoch": 2.7926829268292686, "grad_norm": 0.0, "kl": 0.161609947681427, "learning_rate": 2.064442494606895e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10076 }, { "completion_length": 704.5, "epoch": 2.792960088691796, "grad_norm": 0.0, "kl": 0.17368777096271515, "learning_rate": 2.064011405156478e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10077 }, { "completion_length": 617.75, "epoch": 2.7932372505543235, "grad_norm": 0.0, "kl": 6.445390224456787, "learning_rate": 2.063580329076019e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10078 }, { "completion_length": 676.25, "epoch": 2.7935144124168514, "grad_norm": 0.0, "kl": 0.2076815962791443, "learning_rate": 2.063149266378735e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10079 }, { "completion_length": 623.5, "epoch": 2.7937915742793793, "grad_norm": 0.0, "kl": 770.9624633789062, "learning_rate": 2.0627182170778466e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10080 }, { "completion_length": 665.5, "epoch": 2.7940687361419068, "grad_norm": 0.0, "kl": 0.18202830851078033, "learning_rate": 2.062287181186571e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10081 }, { "completion_length": 522.25, "epoch": 2.7943458980044347, "grad_norm": 0.0, "kl": 0.20549984276294708, "learning_rate": 2.061856158718127e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10082 }, { "completion_length": 621.25, "epoch": 2.7946230598669626, "grad_norm": 0.0, "kl": 0.18255868554115295, "learning_rate": 2.061425149685731e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10083 }, { "completion_length": 670.5, "epoch": 2.79490022172949, "grad_norm": 0.0, "kl": 0.2425089031457901, "learning_rate": 2.060994154102602e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10084 }, { "completion_length": 523.25, "epoch": 2.7951773835920175, "grad_norm": 0.0, "kl": 0.25987714529037476, "learning_rate": 2.060563171981956e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10085 }, { "completion_length": 646.75, "epoch": 2.7954545454545454, "grad_norm": 0.0, "kl": 0.2205289900302887, "learning_rate": 2.0601322033370086e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10086 }, { "completion_length": 558.0, "epoch": 2.7957317073170733, "grad_norm": 0.0, "kl": 0.7147790193557739, "learning_rate": 2.0597012481809786e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10087 }, { "completion_length": 626.5, "epoch": 2.796008869179601, "grad_norm": 0.0, "kl": 0.19652502238750458, "learning_rate": 2.0592703065270777e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10088 }, { "completion_length": 598.25, "epoch": 2.7962860310421287, "grad_norm": 0.40822938084602356, "kl": 37302669312.0, "learning_rate": 2.058839378388525e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10089 }, { "completion_length": 566.5, "epoch": 2.796563192904656, "grad_norm": 0.0, "kl": 0.1966588795185089, "learning_rate": 2.0584084637785316e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10090 }, { "completion_length": 636.75, "epoch": 2.796840354767184, "grad_norm": 0.0, "kl": 184.56690979003906, "learning_rate": 2.0579775627103144e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10091 }, { "completion_length": 696.75, "epoch": 2.7971175166297115, "grad_norm": 0.0, "kl": 0.18230058252811432, "learning_rate": 2.057546675197087e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10092 }, { "completion_length": 612.75, "epoch": 2.7973946784922394, "grad_norm": 0.41935932636260986, "kl": 0.1892424374818802, "learning_rate": 2.057115801252062e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10093 }, { "completion_length": 640.5, "epoch": 2.7976718403547673, "grad_norm": 0.0, "kl": 1.4834016561508179, "learning_rate": 2.056684940888454e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10094 }, { "completion_length": 606.0, "epoch": 2.797949002217295, "grad_norm": 0.0, "kl": 5337145942212608.0, "learning_rate": 2.0562540941194734e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10095 }, { "completion_length": 654.25, "epoch": 2.7982261640798227, "grad_norm": 0.0, "kl": 0.1927940994501114, "learning_rate": 2.0558232609583343e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10096 }, { "completion_length": 738.25, "epoch": 2.79850332594235, "grad_norm": 0.0, "kl": 0.2335577756166458, "learning_rate": 2.0553924414182473e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10097 }, { "completion_length": 539.0, "epoch": 2.798780487804878, "grad_norm": 0.0, "kl": 0.21822106838226318, "learning_rate": 2.0549616355124266e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10098 }, { "completion_length": 613.75, "epoch": 2.7990576496674056, "grad_norm": 0.0, "kl": 51268.8046875, "learning_rate": 2.054530843254079e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10099 }, { "completion_length": 616.0, "epoch": 2.7993348115299335, "grad_norm": 0.0, "kl": 0.1725975126028061, "learning_rate": 2.054100064656418e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10100 }, { "completion_length": 556.75, "epoch": 2.7996119733924614, "grad_norm": 0.0, "kl": 0.2663412392139435, "learning_rate": 2.053669299732654e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10101 }, { "completion_length": 740.75, "epoch": 2.799889135254989, "grad_norm": 0.0, "kl": 0.1684947907924652, "learning_rate": 2.053238548495995e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10102 }, { "completion_length": 655.25, "epoch": 2.8001662971175167, "grad_norm": 0.5598799586296082, "kl": 2002108416.0, "learning_rate": 2.052807810959652e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10103 }, { "completion_length": 692.25, "epoch": 2.800443458980044, "grad_norm": 0.0, "kl": 0.17631252110004425, "learning_rate": 2.0523770871368327e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10104 }, { "completion_length": 607.5, "epoch": 2.800720620842572, "grad_norm": 0.0, "kl": 0.2205425649881363, "learning_rate": 2.0519463770407467e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10105 }, { "completion_length": 534.5, "epoch": 2.8009977827050996, "grad_norm": 0.0, "kl": 0.6785025596618652, "learning_rate": 2.0515156806846003e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10106 }, { "completion_length": 562.5, "epoch": 2.8012749445676275, "grad_norm": 0.0, "kl": 0.20420919358730316, "learning_rate": 2.051084998081602e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10107 }, { "completion_length": 583.0, "epoch": 2.8015521064301554, "grad_norm": 0.0, "kl": 0.19609983265399933, "learning_rate": 2.0506543292449614e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10108 }, { "completion_length": 738.25, "epoch": 2.801829268292683, "grad_norm": 0.0, "kl": 0.2800046503543854, "learning_rate": 2.0502236741878816e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10109 }, { "completion_length": 662.75, "epoch": 2.8021064301552108, "grad_norm": 0.0, "kl": 0.17781803011894226, "learning_rate": 2.0497930329235717e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10110 }, { "completion_length": 543.0, "epoch": 2.8023835920177382, "grad_norm": 0.0, "kl": 0.25001445412635803, "learning_rate": 2.049362405465236e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10111 }, { "completion_length": 590.0, "epoch": 2.802660753880266, "grad_norm": 0.0, "kl": 0.20623283088207245, "learning_rate": 2.048931791826081e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10112 }, { "completion_length": 535.25, "epoch": 2.8029379157427936, "grad_norm": 0.0, "kl": 0.1963447779417038, "learning_rate": 2.0485011920193115e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10113 }, { "completion_length": 641.0, "epoch": 2.8032150776053215, "grad_norm": 0.0, "kl": 0.16568517684936523, "learning_rate": 2.0480706060581317e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10114 }, { "completion_length": 616.5, "epoch": 2.8034922394678494, "grad_norm": 0.0, "kl": 6003.35205078125, "learning_rate": 2.0476400339557475e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10115 }, { "completion_length": 521.75, "epoch": 2.803769401330377, "grad_norm": 10.314651489257812, "kl": 0.20266248285770416, "learning_rate": 2.0472094757253603e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10116 }, { "completion_length": 550.0, "epoch": 2.804046563192905, "grad_norm": 0.0, "kl": 0.2181350141763687, "learning_rate": 2.0467789313801767e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10117 }, { "completion_length": 661.5, "epoch": 2.8043237250554323, "grad_norm": 0.34128618240356445, "kl": 601112896.0, "learning_rate": 2.0463484009333964e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10118 }, { "completion_length": 609.75, "epoch": 2.80460088691796, "grad_norm": 0.0, "kl": 964592576.0, "learning_rate": 2.045917884398225e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10119 }, { "completion_length": 537.5, "epoch": 2.8048780487804876, "grad_norm": 0.0, "kl": 426198765142016.0, "learning_rate": 2.0454873817878614e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10120 }, { "completion_length": 636.5, "epoch": 2.8051552106430155, "grad_norm": 0.0, "kl": 1.4882429838180542, "learning_rate": 2.0450568931155094e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10121 }, { "completion_length": 631.0, "epoch": 2.8054323725055434, "grad_norm": 0.0, "kl": 0.19766885042190552, "learning_rate": 2.044626418394371e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10122 }, { "completion_length": 592.75, "epoch": 2.805709534368071, "grad_norm": 0.0, "kl": 0.20094802975654602, "learning_rate": 2.0441959576376454e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10123 }, { "completion_length": 585.5, "epoch": 2.805986696230599, "grad_norm": 2.8151750564575195, "kl": 2.485031784991949e+16, "learning_rate": 2.0437655108585342e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10124 }, { "completion_length": 528.75, "epoch": 2.8062638580931263, "grad_norm": 0.0, "kl": 0.21461071074008942, "learning_rate": 2.0433350780702364e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10125 }, { "completion_length": 727.0, "epoch": 2.806541019955654, "grad_norm": 0.4246498644351959, "kl": 0.2318369597196579, "learning_rate": 2.0429046592859524e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10126 }, { "completion_length": 689.25, "epoch": 2.8068181818181817, "grad_norm": 0.0, "kl": 0.17965635657310486, "learning_rate": 2.042474254518881e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10127 }, { "completion_length": 579.75, "epoch": 2.8070953436807096, "grad_norm": 0.0, "kl": 0.19248060882091522, "learning_rate": 2.0420438637822206e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10128 }, { "completion_length": 627.5, "epoch": 2.8073725055432375, "grad_norm": 0.0, "kl": 0.18716612458229065, "learning_rate": 2.0416134870891697e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10129 }, { "completion_length": 543.0, "epoch": 2.807649667405765, "grad_norm": 0.0, "kl": 0.20907337963581085, "learning_rate": 2.041183124452926e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10130 }, { "completion_length": 627.75, "epoch": 2.807926829268293, "grad_norm": 0.0, "kl": 0.2305316925048828, "learning_rate": 2.0407527758866884e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10131 }, { "completion_length": 571.75, "epoch": 2.8082039911308203, "grad_norm": 0.0, "kl": 0.2036651074886322, "learning_rate": 2.040322441403652e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10132 }, { "completion_length": 645.75, "epoch": 2.808481152993348, "grad_norm": 2.668924331665039, "kl": 129600072.0, "learning_rate": 2.0398921210170146e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10133 }, { "completion_length": 655.5, "epoch": 2.8087583148558757, "grad_norm": 0.0, "kl": 3.200030667387699e+16, "learning_rate": 2.0394618147399713e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10134 }, { "completion_length": 628.75, "epoch": 2.8090354767184036, "grad_norm": 0.0, "kl": 0.17269636690616608, "learning_rate": 2.039031522585719e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10135 }, { "completion_length": 582.5, "epoch": 2.8093126385809315, "grad_norm": 0.0, "kl": 0.2465883493423462, "learning_rate": 2.038601244567452e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10136 }, { "completion_length": 589.75, "epoch": 2.809589800443459, "grad_norm": 0.0, "kl": 14.832816123962402, "learning_rate": 2.0381709806983647e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10137 }, { "completion_length": 629.0, "epoch": 2.8098669623059864, "grad_norm": 0.3718973398208618, "kl": 2077420928.0, "learning_rate": 2.037740730991654e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10138 }, { "completion_length": 665.0, "epoch": 2.8101441241685143, "grad_norm": 0.0, "kl": 0.1823156625032425, "learning_rate": 2.03731049546051e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10139 }, { "completion_length": 683.25, "epoch": 2.8104212860310422, "grad_norm": 0.6790546774864197, "kl": 1883201536.0, "learning_rate": 2.03688027411813e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10140 }, { "completion_length": 685.75, "epoch": 2.8106984478935697, "grad_norm": 0.0, "kl": 6.293578147888184, "learning_rate": 2.0364500669777045e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10141 }, { "completion_length": 582.25, "epoch": 2.8109756097560976, "grad_norm": 0.0, "kl": 0.21767398715019226, "learning_rate": 2.036019874052428e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10142 }, { "completion_length": 577.75, "epoch": 2.8112527716186255, "grad_norm": 0.0, "kl": 0.19840702414512634, "learning_rate": 2.035589695355491e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10143 }, { "completion_length": 561.0, "epoch": 2.811529933481153, "grad_norm": 0.0, "kl": 1.0562943783534592e+16, "learning_rate": 2.0351595309000864e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10144 }, { "completion_length": 613.25, "epoch": 2.8118070953436805, "grad_norm": 0.0, "kl": 0.1931617110967636, "learning_rate": 2.0347293806994057e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10145 }, { "completion_length": 700.5, "epoch": 2.8120842572062084, "grad_norm": 0.0, "kl": 1.4566001892089844, "learning_rate": 2.034299244766639e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10146 }, { "completion_length": 538.5, "epoch": 2.8123614190687363, "grad_norm": 0.0, "kl": 0.2434922307729721, "learning_rate": 2.0338691231149776e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10147 }, { "completion_length": 652.5, "epoch": 2.8126385809312637, "grad_norm": 0.0, "kl": 0.2391177862882614, "learning_rate": 2.0334390157576105e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10148 }, { "completion_length": 593.75, "epoch": 2.8129157427937916, "grad_norm": 0.0, "kl": 6001992192.0, "learning_rate": 2.033008922707729e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10149 }, { "completion_length": 610.25, "epoch": 2.8131929046563195, "grad_norm": 0.0, "kl": 0.20465266704559326, "learning_rate": 2.03257884397852e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10150 }, { "completion_length": 542.5, "epoch": 2.813470066518847, "grad_norm": 0.48580431938171387, "kl": 1.393220818829312e+16, "learning_rate": 2.0321487795831733e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10151 }, { "completion_length": 567.75, "epoch": 2.8137472283813745, "grad_norm": 0.0, "kl": 0.20493529736995697, "learning_rate": 2.0317187295348785e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10152 }, { "completion_length": 603.5, "epoch": 2.8140243902439024, "grad_norm": 5.112425804138184, "kl": 9746092543442944.0, "learning_rate": 2.0312886938468215e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10153 }, { "completion_length": 647.5, "epoch": 2.8143015521064303, "grad_norm": 0.0, "kl": 232487968.0, "learning_rate": 2.030858672532191e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10154 }, { "completion_length": 601.0, "epoch": 2.8145787139689578, "grad_norm": 0.0, "kl": 0.20479212701320648, "learning_rate": 2.030428665604173e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10155 }, { "completion_length": 639.25, "epoch": 2.8148558758314857, "grad_norm": 0.40472403168678284, "kl": 4434761216.0, "learning_rate": 2.0299986730759553e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10156 }, { "completion_length": 683.5, "epoch": 2.8151330376940136, "grad_norm": 0.0, "kl": 0.43118536472320557, "learning_rate": 2.029568694960722e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10157 }, { "completion_length": 580.0, "epoch": 2.815410199556541, "grad_norm": 0.0, "kl": 0.5253823399543762, "learning_rate": 2.0291387312716606e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10158 }, { "completion_length": 691.0, "epoch": 2.8156873614190685, "grad_norm": 0.0, "kl": 0.18506835401058197, "learning_rate": 2.028708782021955e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10159 }, { "completion_length": 560.5, "epoch": 2.8159645232815964, "grad_norm": 0.0, "kl": 0.1787560135126114, "learning_rate": 2.0282788472247904e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10160 }, { "completion_length": 614.5, "epoch": 2.8162416851441243, "grad_norm": 0.0, "kl": 0.18358632922172546, "learning_rate": 2.027848926893352e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10161 }, { "completion_length": 710.75, "epoch": 2.816518847006652, "grad_norm": 0.0, "kl": 0.20261628925800323, "learning_rate": 2.0274190210408225e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10162 }, { "completion_length": 587.5, "epoch": 2.8167960088691797, "grad_norm": 3.0631439685821533, "kl": 22075.0859375, "learning_rate": 2.026989129680386e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10163 }, { "completion_length": 632.75, "epoch": 2.817073170731707, "grad_norm": 1.7217504978179932, "kl": 148885980839936.0, "learning_rate": 2.026559252825225e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10164 }, { "completion_length": 592.75, "epoch": 2.817350332594235, "grad_norm": 0.0, "kl": 0.26123034954071045, "learning_rate": 2.0261293904885225e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10165 }, { "completion_length": 594.75, "epoch": 2.8176274944567625, "grad_norm": 0.0, "kl": 0.19118261337280273, "learning_rate": 2.02569954268346e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10166 }, { "completion_length": 589.25, "epoch": 2.8179046563192904, "grad_norm": 0.0, "kl": 0.25022217631340027, "learning_rate": 2.0252697094232188e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10167 }, { "completion_length": 586.25, "epoch": 2.8181818181818183, "grad_norm": 0.407297283411026, "kl": 1762350976.0, "learning_rate": 2.0248398907209827e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10168 }, { "completion_length": 697.5, "epoch": 2.818458980044346, "grad_norm": 0.0, "kl": 0.14097851514816284, "learning_rate": 2.0244100865899284e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10169 }, { "completion_length": 512.25, "epoch": 2.8187361419068737, "grad_norm": 0.0, "kl": 0.18589860200881958, "learning_rate": 2.02398029704324e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10170 }, { "completion_length": 605.75, "epoch": 2.819013303769401, "grad_norm": 0.0, "kl": 0.23353219032287598, "learning_rate": 2.0235505220940946e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10171 }, { "completion_length": 580.75, "epoch": 2.819290465631929, "grad_norm": 0.0, "kl": 0.21148498356342316, "learning_rate": 2.0231207617556736e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10172 }, { "completion_length": 613.75, "epoch": 2.8195676274944566, "grad_norm": 0.0, "kl": 0.18746888637542725, "learning_rate": 2.0226910160411545e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10173 }, { "completion_length": 552.5, "epoch": 2.8198447893569845, "grad_norm": 0.0, "kl": 0.44030970335006714, "learning_rate": 2.0222612849637162e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10174 }, { "completion_length": 570.0, "epoch": 2.8201219512195124, "grad_norm": 0.0, "kl": 0.19009998440742493, "learning_rate": 2.021831568536538e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10175 }, { "completion_length": 511.25, "epoch": 2.82039911308204, "grad_norm": 0.0, "kl": 0.1921854019165039, "learning_rate": 2.021401866772795e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10176 }, { "completion_length": 575.75, "epoch": 2.8206762749445677, "grad_norm": 0.0, "kl": 0.19028976559638977, "learning_rate": 2.020972179685667e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10177 }, { "completion_length": 609.75, "epoch": 2.820953436807095, "grad_norm": 0.0, "kl": 0.19797104597091675, "learning_rate": 2.0205425072883293e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10178 }, { "completion_length": 500.5, "epoch": 2.821230598669623, "grad_norm": 0.0, "kl": 0.4060506224632263, "learning_rate": 2.0201128495939584e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10179 }, { "completion_length": 520.5, "epoch": 2.8215077605321506, "grad_norm": 0.0, "kl": 0.17676810920238495, "learning_rate": 2.019683206615729e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10180 }, { "completion_length": 642.25, "epoch": 2.8217849223946785, "grad_norm": 0.0, "kl": 0.16591672599315643, "learning_rate": 2.0192535783668185e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10181 }, { "completion_length": 562.75, "epoch": 2.8220620842572064, "grad_norm": 0.0, "kl": 217.4403839111328, "learning_rate": 2.0188239648604014e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10182 }, { "completion_length": 491.75, "epoch": 2.822339246119734, "grad_norm": 0.0, "kl": 0.23777693510055542, "learning_rate": 2.0183943661096507e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10183 }, { "completion_length": 610.25, "epoch": 2.8226164079822618, "grad_norm": 0.4530014097690582, "kl": 4229165824.0, "learning_rate": 2.0179647821277422e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10184 }, { "completion_length": 681.75, "epoch": 2.8228935698447892, "grad_norm": 0.0, "kl": 0.16229765117168427, "learning_rate": 2.017535212927848e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10185 }, { "completion_length": 619.25, "epoch": 2.823170731707317, "grad_norm": 0.0, "kl": 1501546.5, "learning_rate": 2.0171056585231425e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10186 }, { "completion_length": 577.5, "epoch": 2.8234478935698446, "grad_norm": 0.0, "kl": 0.21866180002689362, "learning_rate": 2.016676118926797e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10187 }, { "completion_length": 615.0, "epoch": 2.8237250554323725, "grad_norm": 0.0, "kl": 0.20387916266918182, "learning_rate": 2.016246594151984e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10188 }, { "completion_length": 591.0, "epoch": 2.8240022172949004, "grad_norm": 0.0, "kl": 0.17506828904151917, "learning_rate": 2.015817084211877e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10189 }, { "completion_length": 636.75, "epoch": 2.824279379157428, "grad_norm": 0.0, "kl": 6735091712.0, "learning_rate": 2.0153875891196447e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10190 }, { "completion_length": 598.5, "epoch": 2.824556541019956, "grad_norm": 0.0, "kl": 0.17265622317790985, "learning_rate": 2.0149581088884595e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10191 }, { "completion_length": 546.25, "epoch": 2.8248337028824833, "grad_norm": 0.0, "kl": 0.20143325626850128, "learning_rate": 2.014528643531491e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10192 }, { "completion_length": 604.5, "epoch": 2.825110864745011, "grad_norm": 0.0, "kl": 0.2310786098241806, "learning_rate": 2.0140991930619103e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10193 }, { "completion_length": 590.25, "epoch": 2.8253880266075386, "grad_norm": 0.0, "kl": 0.18077775835990906, "learning_rate": 2.0136697574928853e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10194 }, { "completion_length": 625.0, "epoch": 2.8256651884700665, "grad_norm": 0.0, "kl": 0.4959372580051422, "learning_rate": 2.0132403368375862e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10195 }, { "completion_length": 587.75, "epoch": 2.8259423503325944, "grad_norm": 0.0, "kl": 0.2002500742673874, "learning_rate": 2.012810931109181e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10196 }, { "completion_length": 651.5, "epoch": 2.826219512195122, "grad_norm": 0.0, "kl": 0.20952732861042023, "learning_rate": 2.0123815403208374e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10197 }, { "completion_length": 613.0, "epoch": 2.82649667405765, "grad_norm": 0.0, "kl": 0.24627800285816193, "learning_rate": 2.0119521644857243e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10198 }, { "completion_length": 638.0, "epoch": 2.8267738359201773, "grad_norm": 0.0, "kl": 0.18577933311462402, "learning_rate": 2.011522803617007e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10199 }, { "completion_length": 609.0, "epoch": 2.827050997782705, "grad_norm": 0.0, "kl": 0.32810547947883606, "learning_rate": 2.011093457727855e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10200 }, { "completion_length": 526.5, "epoch": 2.8273281596452327, "grad_norm": 0.0, "kl": 0.1937255710363388, "learning_rate": 2.010664126831431e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10201 }, { "completion_length": 518.5, "epoch": 2.8276053215077606, "grad_norm": 0.0, "kl": 0.4452638030052185, "learning_rate": 2.0102348109409046e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10202 }, { "completion_length": 623.5, "epoch": 2.8278824833702885, "grad_norm": 0.0, "kl": 0.2501371502876282, "learning_rate": 2.0098055100694373e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10203 }, { "completion_length": 586.75, "epoch": 2.828159645232816, "grad_norm": 0.3892580568790436, "kl": 0.20254290103912354, "learning_rate": 2.0093762242301964e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10204 }, { "completion_length": 561.5, "epoch": 2.828436807095344, "grad_norm": 0.0, "kl": 0.1987265646457672, "learning_rate": 2.0089469534363464e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10205 }, { "completion_length": 658.5, "epoch": 2.8287139689578713, "grad_norm": 0.0, "kl": 0.2968871295452118, "learning_rate": 2.0085176977010503e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10206 }, { "completion_length": 570.75, "epoch": 2.828991130820399, "grad_norm": 0.0, "kl": 0.1990395188331604, "learning_rate": 2.008088457037472e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10207 }, { "completion_length": 682.0, "epoch": 2.8292682926829267, "grad_norm": 0.0, "kl": 0.1895492523908615, "learning_rate": 2.007659231458774e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10208 }, { "completion_length": 578.25, "epoch": 2.8295454545454546, "grad_norm": 0.0, "kl": 0.19471125304698944, "learning_rate": 2.00723002097812e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10209 }, { "completion_length": 593.0, "epoch": 2.8298226164079825, "grad_norm": 2.6461098194122314, "kl": 1485315899392.0, "learning_rate": 2.0068008256086697e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10210 }, { "completion_length": 594.0, "epoch": 2.83009977827051, "grad_norm": 0.0, "kl": 0.18686895072460175, "learning_rate": 2.0063716453635875e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10211 }, { "completion_length": 530.0, "epoch": 2.8303769401330374, "grad_norm": 0.0, "kl": 8212324352.0, "learning_rate": 2.005942480256034e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10212 }, { "completion_length": 550.25, "epoch": 2.8306541019955653, "grad_norm": 0.0, "kl": 0.19429145753383636, "learning_rate": 2.005513330299169e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10213 }, { "completion_length": 589.0, "epoch": 2.8309312638580932, "grad_norm": 0.0, "kl": 0.17870590090751648, "learning_rate": 2.0050841955061535e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10214 }, { "completion_length": 658.0, "epoch": 2.8312084257206207, "grad_norm": 0.0, "kl": 0.20548118650913239, "learning_rate": 2.0046550758901463e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10215 }, { "completion_length": 627.0, "epoch": 2.8314855875831486, "grad_norm": 0.0, "kl": 0.17630264163017273, "learning_rate": 2.004225971464308e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10216 }, { "completion_length": 645.25, "epoch": 2.8317627494456765, "grad_norm": 0.0, "kl": 0.16406285762786865, "learning_rate": 2.0037968822417963e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10217 }, { "completion_length": 563.5, "epoch": 2.832039911308204, "grad_norm": 0.46805891394615173, "kl": 0.20983345806598663, "learning_rate": 2.0033678082357694e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10218 }, { "completion_length": 646.75, "epoch": 2.8323170731707314, "grad_norm": 0.0, "kl": 0.6372479796409607, "learning_rate": 2.0029387494593872e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10219 }, { "completion_length": 606.5, "epoch": 2.8325942350332594, "grad_norm": 0.43346020579338074, "kl": 0.19933371245861053, "learning_rate": 2.0025097059258047e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10220 }, { "completion_length": 631.25, "epoch": 2.8328713968957873, "grad_norm": 0.0, "kl": 0.20254063606262207, "learning_rate": 2.0020806776481818e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10221 }, { "completion_length": 616.0, "epoch": 2.8331485587583147, "grad_norm": 0.0, "kl": 0.1798570454120636, "learning_rate": 2.001651664639671e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10222 }, { "completion_length": 575.0, "epoch": 2.8334257206208426, "grad_norm": 0.0, "kl": 0.1911650002002716, "learning_rate": 2.001222666913432e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10223 }, { "completion_length": 618.75, "epoch": 2.8337028824833705, "grad_norm": 0.3638322353363037, "kl": 0.1758962869644165, "learning_rate": 2.0007936844826185e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10224 }, { "completion_length": 607.75, "epoch": 2.833980044345898, "grad_norm": 0.0, "kl": 0.16802802681922913, "learning_rate": 2.0003647173603864e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10225 }, { "completion_length": 611.25, "epoch": 2.8342572062084255, "grad_norm": 0.0, "kl": 0.21086353063583374, "learning_rate": 1.9999357655598894e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10226 }, { "completion_length": 647.75, "epoch": 2.8345343680709534, "grad_norm": 0.0, "kl": 0.17280639708042145, "learning_rate": 1.999506829094282e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10227 }, { "completion_length": 652.25, "epoch": 2.8348115299334813, "grad_norm": 0.0, "kl": 0.3344341218471527, "learning_rate": 1.9990779079767193e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10228 }, { "completion_length": 663.25, "epoch": 2.8350886917960088, "grad_norm": 0.3889295756816864, "kl": 4583884377292800.0, "learning_rate": 1.9986490022203526e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10229 }, { "completion_length": 570.0, "epoch": 2.8353658536585367, "grad_norm": 0.0, "kl": 0.20300401747226715, "learning_rate": 1.998220111838336e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10230 }, { "completion_length": 619.5, "epoch": 2.8356430155210646, "grad_norm": 0.0, "kl": 0.1737157702445984, "learning_rate": 1.99779123684382e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10231 }, { "completion_length": 592.5, "epoch": 2.835920177383592, "grad_norm": 1.275026798248291, "kl": 10878767104.0, "learning_rate": 1.9973623772499593e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10232 }, { "completion_length": 711.0, "epoch": 2.8361973392461195, "grad_norm": 0.0, "kl": 0.19114629924297333, "learning_rate": 1.9969335330699017e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10233 }, { "completion_length": 618.75, "epoch": 2.8364745011086474, "grad_norm": 0.0, "kl": 0.3714618682861328, "learning_rate": 1.996504704316801e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10234 }, { "completion_length": 580.0, "epoch": 2.8367516629711753, "grad_norm": 0.0, "kl": 0.44829389452934265, "learning_rate": 1.996075891003807e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10235 }, { "completion_length": 686.0, "epoch": 2.837028824833703, "grad_norm": 0.0, "kl": 0.1928187608718872, "learning_rate": 1.9956470931440685e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10236 }, { "completion_length": 518.25, "epoch": 2.8373059866962307, "grad_norm": 0.0, "kl": 0.20493124425411224, "learning_rate": 1.995218310750736e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10237 }, { "completion_length": 535.5, "epoch": 2.837583148558758, "grad_norm": 0.0, "kl": 0.2281835526227951, "learning_rate": 1.9947895438369573e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10238 }, { "completion_length": 548.5, "epoch": 2.837860310421286, "grad_norm": 0.0, "kl": 1.0902388095855713, "learning_rate": 1.9943607924158826e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10239 }, { "completion_length": 555.75, "epoch": 2.8381374722838135, "grad_norm": 0.0, "kl": 0.18649151921272278, "learning_rate": 1.993932056500658e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10240 }, { "completion_length": 574.25, "epoch": 2.8384146341463414, "grad_norm": 0.0, "kl": 0.42333871126174927, "learning_rate": 1.993503336104432e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10241 }, { "completion_length": 659.0, "epoch": 2.8386917960088693, "grad_norm": 0.0, "kl": 0.22080892324447632, "learning_rate": 1.993074631240353e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10242 }, { "completion_length": 560.75, "epoch": 2.838968957871397, "grad_norm": 0.42236924171447754, "kl": 4169619212337152.0, "learning_rate": 1.9926459419215654e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10243 }, { "completion_length": 567.0, "epoch": 2.8392461197339247, "grad_norm": 0.0, "kl": 0.1705223023891449, "learning_rate": 1.9922172681612163e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10244 }, { "completion_length": 520.75, "epoch": 2.839523281596452, "grad_norm": 0.0, "kl": 0.2768048942089081, "learning_rate": 1.9917886099724516e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10245 }, { "completion_length": 655.25, "epoch": 2.83980044345898, "grad_norm": 0.0, "kl": 0.20193907618522644, "learning_rate": 1.991359967368416e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10246 }, { "completion_length": 568.0, "epoch": 2.8400776053215075, "grad_norm": 0.0, "kl": 0.202043354511261, "learning_rate": 1.990931340362254e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10247 }, { "completion_length": 495.75, "epoch": 2.8403547671840355, "grad_norm": 0.0, "kl": 0.28300729393959045, "learning_rate": 1.9905027289671104e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10248 }, { "completion_length": 671.25, "epoch": 2.8406319290465634, "grad_norm": 0.0, "kl": 0.18964293599128723, "learning_rate": 1.9900741331961287e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10249 }, { "completion_length": 537.5, "epoch": 2.840909090909091, "grad_norm": 0.0, "kl": 0.2173180878162384, "learning_rate": 1.9896455530624513e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10250 }, { "completion_length": 609.0, "epoch": 2.8411862527716187, "grad_norm": 6.3928446769714355, "kl": 119859962707968.0, "learning_rate": 1.9892169885792233e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10251 }, { "completion_length": 560.75, "epoch": 2.841463414634146, "grad_norm": 0.0, "kl": 0.25201407074928284, "learning_rate": 1.988788439759584e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10252 }, { "completion_length": 563.5, "epoch": 2.841740576496674, "grad_norm": 0.0, "kl": 0.20871371030807495, "learning_rate": 1.988359906616678e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10253 }, { "completion_length": 611.0, "epoch": 2.8420177383592016, "grad_norm": 0.0, "kl": 0.1970028579235077, "learning_rate": 1.9879313891636437e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10254 }, { "completion_length": 503.0, "epoch": 2.8422949002217295, "grad_norm": 0.5277625322341919, "kl": 0.20865003764629364, "learning_rate": 1.987502887413625e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10255 }, { "completion_length": 634.5, "epoch": 2.8425720620842574, "grad_norm": 0.3512512743473053, "kl": 3890983424.0, "learning_rate": 1.98707440137976e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10256 }, { "completion_length": 530.75, "epoch": 2.842849223946785, "grad_norm": 0.0, "kl": 1.5851935148239136, "learning_rate": 1.9866459310751894e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10257 }, { "completion_length": 638.0, "epoch": 2.8431263858093128, "grad_norm": 0.0, "kl": 0.17640210688114166, "learning_rate": 1.986217476513053e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10258 }, { "completion_length": 616.5, "epoch": 2.8434035476718402, "grad_norm": 0.0, "kl": 0.16851310431957245, "learning_rate": 1.985789037706489e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10259 }, { "completion_length": 580.25, "epoch": 2.843680709534368, "grad_norm": 0.0, "kl": 0.19047115743160248, "learning_rate": 1.9853606146686365e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10260 }, { "completion_length": 585.5, "epoch": 2.8439578713968956, "grad_norm": 0.0, "kl": 0.18346449732780457, "learning_rate": 1.984932207412632e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10261 }, { "completion_length": 571.5, "epoch": 2.8442350332594235, "grad_norm": 0.0, "kl": 0.2840348780155182, "learning_rate": 1.984503815951616e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10262 }, { "completion_length": 730.0, "epoch": 2.8445121951219514, "grad_norm": 0.0, "kl": 0.21219582855701447, "learning_rate": 1.984075440298722e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10263 }, { "completion_length": 636.25, "epoch": 2.844789356984479, "grad_norm": 0.0, "kl": 0.185189887881279, "learning_rate": 1.983647080467088e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10264 }, { "completion_length": 473.75, "epoch": 2.845066518847007, "grad_norm": 0.0, "kl": 0.23805418610572815, "learning_rate": 1.983218736469851e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10265 }, { "completion_length": 467.25, "epoch": 2.8453436807095343, "grad_norm": 0.0, "kl": 0.2732771933078766, "learning_rate": 1.982790408320145e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10266 }, { "completion_length": 631.75, "epoch": 2.845620842572062, "grad_norm": 0.0, "kl": 0.18930961191654205, "learning_rate": 1.982362096031106e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10267 }, { "completion_length": 508.5, "epoch": 2.8458980044345896, "grad_norm": 0.0, "kl": 14816526336.0, "learning_rate": 1.981933799615868e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10268 }, { "completion_length": 497.75, "epoch": 2.8461751662971175, "grad_norm": 0.0, "kl": 0.28918761014938354, "learning_rate": 1.9815055190875656e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10269 }, { "completion_length": 654.75, "epoch": 2.8464523281596454, "grad_norm": 0.0, "kl": 0.1834862381219864, "learning_rate": 1.9810772544593314e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10270 }, { "completion_length": 608.5, "epoch": 2.846729490022173, "grad_norm": 0.0, "kl": 0.18383942544460297, "learning_rate": 1.980649005744299e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10271 }, { "completion_length": 659.5, "epoch": 2.847006651884701, "grad_norm": 0.0, "kl": 0.18325869739055634, "learning_rate": 1.9802207729556023e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10272 }, { "completion_length": 539.0, "epoch": 2.8472838137472283, "grad_norm": 0.0, "kl": 0.23727655410766602, "learning_rate": 1.9797925561063706e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10273 }, { "completion_length": 596.25, "epoch": 2.847560975609756, "grad_norm": 0.0, "kl": 0.2767907679080963, "learning_rate": 1.979364355209739e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10274 }, { "completion_length": 600.75, "epoch": 2.8478381374722836, "grad_norm": 0.0, "kl": 0.18196038901805878, "learning_rate": 1.9789361702788356e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10275 }, { "completion_length": 510.5, "epoch": 2.8481152993348116, "grad_norm": 0.0, "kl": 0.1921175718307495, "learning_rate": 1.9785080013267933e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10276 }, { "completion_length": 539.5, "epoch": 2.8483924611973395, "grad_norm": 0.0, "kl": 0.2183539867401123, "learning_rate": 1.97807984836674e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10277 }, { "completion_length": 549.0, "epoch": 2.848669623059867, "grad_norm": 0.0, "kl": 0.22101248800754547, "learning_rate": 1.9776517114118075e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10278 }, { "completion_length": 537.25, "epoch": 2.848946784922395, "grad_norm": 0.0, "kl": 695870399971328.0, "learning_rate": 1.977223590475124e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10279 }, { "completion_length": 493.0, "epoch": 2.8492239467849223, "grad_norm": 0.0, "kl": 0.3706739544868469, "learning_rate": 1.976795485569817e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10280 }, { "completion_length": 645.75, "epoch": 2.84950110864745, "grad_norm": 0.0, "kl": 0.20646104216575623, "learning_rate": 1.9763673967090184e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10281 }, { "completion_length": 555.0, "epoch": 2.8497782705099777, "grad_norm": 0.6348970532417297, "kl": 19404625920.0, "learning_rate": 1.975939323905852e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10282 }, { "completion_length": 591.5, "epoch": 2.8500554323725056, "grad_norm": 0.0, "kl": 0.20482562482357025, "learning_rate": 1.9755112671734472e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10283 }, { "completion_length": 578.75, "epoch": 2.8503325942350335, "grad_norm": 0.0, "kl": 0.25251463055610657, "learning_rate": 1.9750832265249286e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10284 }, { "completion_length": 591.75, "epoch": 2.850609756097561, "grad_norm": 0.0, "kl": 0.24645665287971497, "learning_rate": 1.9746552019734246e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10285 }, { "completion_length": 569.0, "epoch": 2.8508869179600884, "grad_norm": 0.0, "kl": 0.17877906560897827, "learning_rate": 1.9742271935320607e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10286 }, { "completion_length": 591.0, "epoch": 2.8511640798226163, "grad_norm": 0.4345161020755768, "kl": 2707594496.0, "learning_rate": 1.9737992012139614e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10287 }, { "completion_length": 612.5, "epoch": 2.8514412416851442, "grad_norm": 0.44237756729125977, "kl": 1.3653224292614144e+16, "learning_rate": 1.9733712250322518e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10288 }, { "completion_length": 593.0, "epoch": 2.8517184035476717, "grad_norm": 0.0, "kl": 0.2084357887506485, "learning_rate": 1.9729432650000554e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10289 }, { "completion_length": 561.25, "epoch": 2.8519955654101996, "grad_norm": 0.0, "kl": 0.21618874371051788, "learning_rate": 1.972515321130497e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10290 }, { "completion_length": 576.25, "epoch": 2.8522727272727275, "grad_norm": 0.0, "kl": 0.22684280574321747, "learning_rate": 1.9720873934366993e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10291 }, { "completion_length": 671.5, "epoch": 2.852549889135255, "grad_norm": 0.5774203538894653, "kl": 726117908480000.0, "learning_rate": 1.9716594819317854e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10292 }, { "completion_length": 574.5, "epoch": 2.8528270509977824, "grad_norm": 0.43436262011528015, "kl": 46651658240.0, "learning_rate": 1.9712315866288765e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10293 }, { "completion_length": 583.0, "epoch": 2.8531042128603104, "grad_norm": 0.0, "kl": 3.355619078910771e+16, "learning_rate": 1.970803707541096e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10294 }, { "completion_length": 563.75, "epoch": 2.8533813747228383, "grad_norm": 0.0, "kl": 0.24322432279586792, "learning_rate": 1.9703758446815645e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10295 }, { "completion_length": 552.5, "epoch": 2.8536585365853657, "grad_norm": 0.0, "kl": 0.2873859703540802, "learning_rate": 1.969947998063402e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10296 }, { "completion_length": 640.0, "epoch": 2.8539356984478936, "grad_norm": 0.0, "kl": 0.19591310620307922, "learning_rate": 1.96952016769973e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10297 }, { "completion_length": 629.75, "epoch": 2.8542128603104215, "grad_norm": 0.0, "kl": 0.17741672694683075, "learning_rate": 1.9690923536036673e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10298 }, { "completion_length": 511.5, "epoch": 2.854490022172949, "grad_norm": 0.0, "kl": 0.48166102170944214, "learning_rate": 1.9686645557883346e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10299 }, { "completion_length": 570.0, "epoch": 2.8547671840354765, "grad_norm": 0.57463139295578, "kl": 59631046656.0, "learning_rate": 1.9682367742668483e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10300 }, { "completion_length": 589.0, "epoch": 2.8550443458980044, "grad_norm": 0.0, "kl": 0.166433647274971, "learning_rate": 1.9678090090523284e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10301 }, { "completion_length": 503.5, "epoch": 2.8553215077605323, "grad_norm": 0.0, "kl": 0.2175750881433487, "learning_rate": 1.967381260157893e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10302 }, { "completion_length": 640.0, "epoch": 2.8555986696230597, "grad_norm": 0.0, "kl": 0.1613416075706482, "learning_rate": 1.9669535275966577e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10303 }, { "completion_length": 578.5, "epoch": 2.8558758314855877, "grad_norm": 0.0, "kl": 0.28783950209617615, "learning_rate": 1.966525811381742e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10304 }, { "completion_length": 523.5, "epoch": 2.8561529933481156, "grad_norm": 0.0, "kl": 3968417711259648.0, "learning_rate": 1.9660981115262583e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10305 }, { "completion_length": 580.0, "epoch": 2.856430155210643, "grad_norm": 0.0, "kl": 0.16781601309776306, "learning_rate": 1.965670428043326e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10306 }, { "completion_length": 588.75, "epoch": 2.8567073170731705, "grad_norm": 0.9501398205757141, "kl": 8987080554184704.0, "learning_rate": 1.9652427609460587e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10307 }, { "completion_length": 465.5, "epoch": 2.8569844789356984, "grad_norm": 0.0, "kl": 0.24678084254264832, "learning_rate": 1.964815110247571e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10308 }, { "completion_length": 562.75, "epoch": 2.8572616407982263, "grad_norm": 0.0, "kl": 0.22788415849208832, "learning_rate": 1.964387475960978e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10309 }, { "completion_length": 572.0, "epoch": 2.8575388026607538, "grad_norm": 0.0, "kl": 0.191241055727005, "learning_rate": 1.9639598580993926e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10310 }, { "completion_length": 527.75, "epoch": 2.8578159645232817, "grad_norm": 0.0, "kl": 0.29732632637023926, "learning_rate": 1.963532256675929e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10311 }, { "completion_length": 592.25, "epoch": 2.858093126385809, "grad_norm": 0.5591398477554321, "kl": 1.728785539923968e+16, "learning_rate": 1.9631046717036985e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10312 }, { "completion_length": 585.75, "epoch": 2.858370288248337, "grad_norm": 0.0, "kl": 0.2025027573108673, "learning_rate": 1.9626771031958157e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10313 }, { "completion_length": 673.0, "epoch": 2.8586474501108645, "grad_norm": 0.0, "kl": 0.21602651476860046, "learning_rate": 1.9622495511653894e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10314 }, { "completion_length": 615.25, "epoch": 2.8589246119733924, "grad_norm": 0.0, "kl": 0.7492108345031738, "learning_rate": 1.961822015625533e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10315 }, { "completion_length": 565.5, "epoch": 2.8592017738359203, "grad_norm": 0.0, "kl": 0.2227257639169693, "learning_rate": 1.961394496589357e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10316 }, { "completion_length": 618.0, "epoch": 2.859478935698448, "grad_norm": 0.0, "kl": 0.22662273049354553, "learning_rate": 1.9609669940699707e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10317 }, { "completion_length": 567.5, "epoch": 2.8597560975609757, "grad_norm": 0.0, "kl": 0.25655436515808105, "learning_rate": 1.960539508080485e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10318 }, { "completion_length": 555.75, "epoch": 2.860033259423503, "grad_norm": 0.37815359234809875, "kl": 1.956748273553244e+17, "learning_rate": 1.960112038634008e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10319 }, { "completion_length": 672.25, "epoch": 2.860310421286031, "grad_norm": 0.0, "kl": 0.15845660865306854, "learning_rate": 1.9596845857436496e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10320 }, { "completion_length": 568.75, "epoch": 2.8605875831485585, "grad_norm": 0.0, "kl": 9.804664611816406, "learning_rate": 1.9592571494225165e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10321 }, { "completion_length": 586.75, "epoch": 2.8608647450110865, "grad_norm": 0.0, "kl": 0.4664618670940399, "learning_rate": 1.9588297296837177e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10322 }, { "completion_length": 598.0, "epoch": 2.8611419068736144, "grad_norm": 0.0, "kl": 0.18161530792713165, "learning_rate": 1.958402326540359e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10323 }, { "completion_length": 574.0, "epoch": 2.861419068736142, "grad_norm": 0.0, "kl": 0.1834140419960022, "learning_rate": 1.957974940005548e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10324 }, { "completion_length": 620.75, "epoch": 2.8616962305986697, "grad_norm": 0.0, "kl": 0.23069661855697632, "learning_rate": 1.9575475700923917e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10325 }, { "completion_length": 556.0, "epoch": 2.861973392461197, "grad_norm": 0.0, "kl": 0.2791122794151306, "learning_rate": 1.9571202168139948e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10326 }, { "completion_length": 496.75, "epoch": 2.862250554323725, "grad_norm": 0.0, "kl": 0.26618677377700806, "learning_rate": 1.956692880183462e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10327 }, { "completion_length": 487.5, "epoch": 2.8625277161862526, "grad_norm": 0.0, "kl": 0.18605995178222656, "learning_rate": 1.956265560213899e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10328 }, { "completion_length": 596.75, "epoch": 2.8628048780487805, "grad_norm": 0.0, "kl": 0.8569542765617371, "learning_rate": 1.9558382569184097e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10329 }, { "completion_length": 597.0, "epoch": 2.8630820399113084, "grad_norm": 0.0, "kl": 0.335360050201416, "learning_rate": 1.9554109703100964e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10330 }, { "completion_length": 548.75, "epoch": 2.863359201773836, "grad_norm": 0.0, "kl": 0.22558017075061798, "learning_rate": 1.954983700402063e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10331 }, { "completion_length": 600.0, "epoch": 2.8636363636363638, "grad_norm": 0.0, "kl": 197858426880.0, "learning_rate": 1.9545564472074138e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10332 }, { "completion_length": 628.0, "epoch": 2.863913525498891, "grad_norm": 0.0, "kl": 0.15947511792182922, "learning_rate": 1.954129210739248e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10333 }, { "completion_length": 571.25, "epoch": 2.864190687361419, "grad_norm": 0.0, "kl": 0.537415623664856, "learning_rate": 1.95370199101067e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10334 }, { "completion_length": 611.25, "epoch": 2.8644678492239466, "grad_norm": 0.0, "kl": 1.7617820501327515, "learning_rate": 1.9532747880347775e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10335 }, { "completion_length": 540.25, "epoch": 2.8647450110864745, "grad_norm": 0.0, "kl": 0.19461829960346222, "learning_rate": 1.952847601824674e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10336 }, { "completion_length": 564.75, "epoch": 2.8650221729490024, "grad_norm": 0.0, "kl": 0.19539493322372437, "learning_rate": 1.952420432393458e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10337 }, { "completion_length": 679.0, "epoch": 2.86529933481153, "grad_norm": 0.0, "kl": 0.16546830534934998, "learning_rate": 1.95199327975423e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10338 }, { "completion_length": 597.75, "epoch": 2.865576496674058, "grad_norm": NaN, "kl": 0.2547377645969391, "learning_rate": 1.9515661439200885e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10339 }, { "completion_length": 550.5, "epoch": 2.8658536585365852, "grad_norm": 0.0, "kl": 0.23823264241218567, "learning_rate": 1.9515661439200885e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10340 }, { "completion_length": 659.75, "epoch": 2.866130820399113, "grad_norm": 0.32077282667160034, "kl": 2.05904185323946e+17, "learning_rate": 1.9511390249041314e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10341 }, { "completion_length": 560.75, "epoch": 2.8664079822616406, "grad_norm": 0.0, "kl": 0.3711238503456116, "learning_rate": 1.950711922719458e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10342 }, { "completion_length": 720.25, "epoch": 2.8666851441241685, "grad_norm": 0.9996623992919922, "kl": 9.691004433910989e+16, "learning_rate": 1.9502848373791643e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10343 }, { "completion_length": 523.75, "epoch": 2.8669623059866964, "grad_norm": 0.0, "kl": 520715962941440.0, "learning_rate": 1.9498577688963483e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10344 }, { "completion_length": 596.0, "epoch": 2.867239467849224, "grad_norm": 0.0, "kl": 1.1847550868988037, "learning_rate": 1.949430717284105e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10345 }, { "completion_length": 571.0, "epoch": 2.867516629711752, "grad_norm": 0.0, "kl": 0.21604420244693756, "learning_rate": 1.949003682555532e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10346 }, { "completion_length": 591.0, "epoch": 2.8677937915742793, "grad_norm": 0.0, "kl": 0.20433734357357025, "learning_rate": 1.9485766647237243e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10347 }, { "completion_length": 558.0, "epoch": 2.868070953436807, "grad_norm": 0.0, "kl": 0.19364427030086517, "learning_rate": 1.948149663801776e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10348 }, { "completion_length": 522.25, "epoch": 2.8683481152993346, "grad_norm": 0.0, "kl": 0.30817076563835144, "learning_rate": 1.947722679802782e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10349 }, { "completion_length": 535.5, "epoch": 2.8686252771618626, "grad_norm": 0.0, "kl": 0.21395276486873627, "learning_rate": 1.947295712739836e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10350 }, { "completion_length": 543.0, "epoch": 2.8689024390243905, "grad_norm": 0.0, "kl": 0.20945991575717926, "learning_rate": 1.9468687626260314e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10351 }, { "completion_length": 597.25, "epoch": 2.869179600886918, "grad_norm": 0.0, "kl": 0.19913728535175323, "learning_rate": 1.9464418294744606e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10352 }, { "completion_length": 606.5, "epoch": 2.869456762749446, "grad_norm": 0.0, "kl": 54272496.0, "learning_rate": 1.946014913298217e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10353 }, { "completion_length": 597.25, "epoch": 2.8697339246119733, "grad_norm": 0.0, "kl": 0.17424780130386353, "learning_rate": 1.9455880141103904e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10354 }, { "completion_length": 525.0, "epoch": 2.870011086474501, "grad_norm": 0.0, "kl": 1.1649160385131836, "learning_rate": 1.9451611319240725e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10355 }, { "completion_length": 616.0, "epoch": 2.8702882483370287, "grad_norm": 0.0, "kl": 0.18254774808883667, "learning_rate": 1.944734266752356e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10356 }, { "completion_length": 607.5, "epoch": 2.8705654101995566, "grad_norm": 0.4024444818496704, "kl": 1.7371658371622502e+17, "learning_rate": 1.944307418608329e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10357 }, { "completion_length": 490.75, "epoch": 2.8708425720620845, "grad_norm": 0.0, "kl": 0.23551225662231445, "learning_rate": 1.9438805875050825e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10358 }, { "completion_length": 478.0, "epoch": 2.871119733924612, "grad_norm": 0.0, "kl": 1.8189753293991089, "learning_rate": 1.9434537734557046e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10359 }, { "completion_length": 630.25, "epoch": 2.8713968957871394, "grad_norm": 0.0, "kl": 3.3797447681427, "learning_rate": 1.9430269764732846e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10360 }, { "completion_length": 543.75, "epoch": 2.8716740576496673, "grad_norm": 0.0, "kl": 0.956221878528595, "learning_rate": 1.94260019657091e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10361 }, { "completion_length": 560.75, "epoch": 2.8719512195121952, "grad_norm": 14.418607711791992, "kl": 19047768.0, "learning_rate": 1.9421734337616684e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10362 }, { "completion_length": 558.5, "epoch": 2.8722283813747227, "grad_norm": 0.0, "kl": 0.18780918419361115, "learning_rate": 1.9417466880586476e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10363 }, { "completion_length": 745.75, "epoch": 2.8725055432372506, "grad_norm": 0.0, "kl": 0.20312976837158203, "learning_rate": 1.9413199594749327e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10364 }, { "completion_length": 585.5, "epoch": 2.8727827050997785, "grad_norm": 0.0, "kl": 0.18913553655147552, "learning_rate": 1.940893248023612e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10365 }, { "completion_length": 719.25, "epoch": 2.873059866962306, "grad_norm": 0.0, "kl": 0.16986584663391113, "learning_rate": 1.940466553717768e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10366 }, { "completion_length": 607.25, "epoch": 2.8733370288248334, "grad_norm": 0.0, "kl": 0.3251364529132843, "learning_rate": 1.940039876570489e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10367 }, { "completion_length": 554.5, "epoch": 2.8736141906873613, "grad_norm": 14.271206855773926, "kl": 4.431092739105225, "learning_rate": 1.9396132165948557e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10368 }, { "completion_length": 733.5, "epoch": 2.8738913525498893, "grad_norm": 0.0, "kl": 0.21971526741981506, "learning_rate": 1.939186573803955e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10369 }, { "completion_length": 487.75, "epoch": 2.8741685144124167, "grad_norm": 0.0, "kl": 0.1965293288230896, "learning_rate": 1.9387599482108692e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10370 }, { "completion_length": 497.75, "epoch": 2.8744456762749446, "grad_norm": 0.0, "kl": 0.23438069224357605, "learning_rate": 1.9383333398286803e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10371 }, { "completion_length": 580.0, "epoch": 2.8747228381374725, "grad_norm": 0.0, "kl": 0.19677948951721191, "learning_rate": 1.9379067486704723e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10372 }, { "completion_length": 587.0, "epoch": 2.875, "grad_norm": 0.0, "kl": 0.19956867396831512, "learning_rate": 1.9374801747493255e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10373 }, { "completion_length": 503.75, "epoch": 2.8752771618625275, "grad_norm": 0.0, "kl": 0.1879235953092575, "learning_rate": 1.9370536180783223e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10374 }, { "completion_length": 574.0, "epoch": 2.8755543237250554, "grad_norm": 0.0, "kl": 0.2601054608821869, "learning_rate": 1.936627078670542e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10375 }, { "completion_length": 588.75, "epoch": 2.8758314855875833, "grad_norm": 0.7256368398666382, "kl": 3.887097874219008e+16, "learning_rate": 1.936200556539066e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10376 }, { "completion_length": 508.25, "epoch": 2.8761086474501107, "grad_norm": 0.0, "kl": 0.1687164008617401, "learning_rate": 1.9357740516969736e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10377 }, { "completion_length": 571.0, "epoch": 2.8763858093126387, "grad_norm": 0.0, "kl": 0.165615051984787, "learning_rate": 1.9353475641573437e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10378 }, { "completion_length": 577.25, "epoch": 2.8766629711751666, "grad_norm": 0.0, "kl": 0.2966717481613159, "learning_rate": 1.934921093933256e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10379 }, { "completion_length": 497.75, "epoch": 2.876940133037694, "grad_norm": 0.43819355964660645, "kl": 0.22636649012565613, "learning_rate": 1.9344946410377868e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10380 }, { "completion_length": 563.25, "epoch": 2.8772172949002215, "grad_norm": 0.0, "kl": 0.2018231451511383, "learning_rate": 1.934068205484015e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10381 }, { "completion_length": 553.0, "epoch": 2.8774944567627494, "grad_norm": 0.0, "kl": 0.25100263953208923, "learning_rate": 1.9336417872850165e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10382 }, { "completion_length": 513.75, "epoch": 2.8777716186252773, "grad_norm": 0.0, "kl": 0.2314857393503189, "learning_rate": 1.9332153864538683e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10383 }, { "completion_length": 522.75, "epoch": 2.8780487804878048, "grad_norm": 0.0, "kl": 0.4116750955581665, "learning_rate": 1.9327890030036477e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10384 }, { "completion_length": 513.25, "epoch": 2.8783259423503327, "grad_norm": 0.0, "kl": 2021562186203136.0, "learning_rate": 1.9323626369474274e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10385 }, { "completion_length": 606.75, "epoch": 2.8786031042128606, "grad_norm": 0.0, "kl": 2.8601152896881104, "learning_rate": 1.931936288298285e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10386 }, { "completion_length": 573.25, "epoch": 2.878880266075388, "grad_norm": 0.0, "kl": 153374.078125, "learning_rate": 1.9315099570692924e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10387 }, { "completion_length": 459.0, "epoch": 2.8791574279379155, "grad_norm": 0.0, "kl": 0.23741187155246735, "learning_rate": 1.9310836432735252e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10388 }, { "completion_length": 600.5, "epoch": 2.8794345898004434, "grad_norm": 0.0, "kl": 0.26073041558265686, "learning_rate": 1.930657346924056e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10389 }, { "completion_length": 604.25, "epoch": 2.8797117516629713, "grad_norm": 0.0, "kl": 24.57570457458496, "learning_rate": 1.9302310680339573e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10390 }, { "completion_length": 546.25, "epoch": 2.879988913525499, "grad_norm": 0.0, "kl": 0.2038932740688324, "learning_rate": 1.929804806616302e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10391 }, { "completion_length": 576.0, "epoch": 2.8802660753880267, "grad_norm": 0.0, "kl": 1.8898141717725184e+16, "learning_rate": 1.929378562684161e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10392 }, { "completion_length": 546.0, "epoch": 2.880543237250554, "grad_norm": 0.0, "kl": 0.2228245884180069, "learning_rate": 1.928952336250606e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10393 }, { "completion_length": 629.0, "epoch": 2.880820399113082, "grad_norm": 1.8102750778198242, "kl": 3916745.0, "learning_rate": 1.928526127328707e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10394 }, { "completion_length": 536.0, "epoch": 2.8810975609756095, "grad_norm": 0.0, "kl": 0.17331087589263916, "learning_rate": 1.9280999359315347e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10395 }, { "completion_length": 707.5, "epoch": 2.8813747228381374, "grad_norm": 0.0, "kl": 0.16100695729255676, "learning_rate": 1.9276737620721576e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10396 }, { "completion_length": 462.75, "epoch": 2.8816518847006654, "grad_norm": 0.0, "kl": 0.24978893995285034, "learning_rate": 1.927247605763647e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10397 }, { "completion_length": 633.0, "epoch": 2.881929046563193, "grad_norm": 0.0, "kl": 0.16619756817817688, "learning_rate": 1.9268214670190675e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10398 }, { "completion_length": 470.25, "epoch": 2.8822062084257207, "grad_norm": 7.080135822296143, "kl": 49738792.0, "learning_rate": 1.9263953458514907e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10399 }, { "completion_length": 587.75, "epoch": 2.882483370288248, "grad_norm": 0.0, "kl": 1.7057098150253296, "learning_rate": 1.9259692422739824e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10400 }, { "completion_length": 509.75, "epoch": 2.882760532150776, "grad_norm": 0.0, "kl": 0.20460955798625946, "learning_rate": 1.9255431562996094e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10401 }, { "completion_length": 497.5, "epoch": 2.8830376940133036, "grad_norm": 0.0, "kl": 0.20744013786315918, "learning_rate": 1.9251170879414384e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10402 }, { "completion_length": 592.5, "epoch": 2.8833148558758315, "grad_norm": 0.0, "kl": 0.22962048649787903, "learning_rate": 1.9246910372125345e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10403 }, { "completion_length": 631.25, "epoch": 2.8835920177383594, "grad_norm": 0.0, "kl": 0.1632947474718094, "learning_rate": 1.9242650041259636e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10404 }, { "completion_length": 523.0, "epoch": 2.883869179600887, "grad_norm": 0.0, "kl": 0.5713151693344116, "learning_rate": 1.9238389886947895e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10405 }, { "completion_length": 495.25, "epoch": 2.8841463414634148, "grad_norm": 0.0, "kl": 0.2166662961244583, "learning_rate": 1.9234129909320763e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10406 }, { "completion_length": 510.75, "epoch": 2.884423503325942, "grad_norm": 0.0, "kl": 0.2051761895418167, "learning_rate": 1.9229870108508892e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10407 }, { "completion_length": 520.5, "epoch": 2.88470066518847, "grad_norm": 0.0, "kl": 0.5457494854927063, "learning_rate": 1.92256104846429e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10408 }, { "completion_length": 553.0, "epoch": 2.8849778270509976, "grad_norm": 0.0, "kl": 0.20936059951782227, "learning_rate": 1.9221351037853416e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10409 }, { "completion_length": 600.25, "epoch": 2.8852549889135255, "grad_norm": 0.3991633355617523, "kl": 0.15824486315250397, "learning_rate": 1.9217091768271052e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10410 }, { "completion_length": 577.0, "epoch": 2.8855321507760534, "grad_norm": 0.0, "kl": 0.20252743363380432, "learning_rate": 1.921283267602643e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10411 }, { "completion_length": 488.25, "epoch": 2.885809312638581, "grad_norm": 0.0, "kl": 0.2150574028491974, "learning_rate": 1.9208573761250154e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10412 }, { "completion_length": 510.5, "epoch": 2.886086474501109, "grad_norm": 0.0, "kl": 0.23034581542015076, "learning_rate": 1.9204315024072827e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10413 }, { "completion_length": 578.75, "epoch": 2.8863636363636362, "grad_norm": 0.0, "kl": 0.18190322816371918, "learning_rate": 1.9200056464625056e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10414 }, { "completion_length": 655.75, "epoch": 2.886640798226164, "grad_norm": 0.0, "kl": 0.8481832146644592, "learning_rate": 1.919579808303741e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10415 }, { "completion_length": 506.25, "epoch": 2.8869179600886916, "grad_norm": 0.0, "kl": 0.21697483956813812, "learning_rate": 1.9191539879440515e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10416 }, { "completion_length": 556.5, "epoch": 2.8871951219512195, "grad_norm": 1.9673113822937012, "kl": 372892064.0, "learning_rate": 1.918728185396491e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10417 }, { "completion_length": 475.5, "epoch": 2.8874722838137474, "grad_norm": 0.0, "kl": 0.21699468791484833, "learning_rate": 1.9183024006741206e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10418 }, { "completion_length": 632.5, "epoch": 2.887749445676275, "grad_norm": 0.0, "kl": 0.19317665696144104, "learning_rate": 1.9178766337899943e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10419 }, { "completion_length": 530.25, "epoch": 2.888026607538803, "grad_norm": 0.0, "kl": 0.20985393226146698, "learning_rate": 1.917450884757171e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10420 }, { "completion_length": 549.0, "epoch": 2.8883037694013303, "grad_norm": 0.0, "kl": 0.2014526128768921, "learning_rate": 1.9170251535887054e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10421 }, { "completion_length": 607.5, "epoch": 2.888580931263858, "grad_norm": 0.0, "kl": 0.17874349653720856, "learning_rate": 1.916599440297653e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10422 }, { "completion_length": 546.75, "epoch": 2.8888580931263856, "grad_norm": 0.0, "kl": 0.16572795808315277, "learning_rate": 1.9161737448970697e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10423 }, { "completion_length": 594.75, "epoch": 2.8891352549889135, "grad_norm": 0.0, "kl": 0.24160999059677124, "learning_rate": 1.915748067400008e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10424 }, { "completion_length": 612.25, "epoch": 2.8894124168514415, "grad_norm": 0.0, "kl": 0.26161935925483704, "learning_rate": 1.9153224078195236e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10425 }, { "completion_length": 548.0, "epoch": 2.889689578713969, "grad_norm": 0.0, "kl": 0.18874065577983856, "learning_rate": 1.9148967661686675e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10426 }, { "completion_length": 521.0, "epoch": 2.889966740576497, "grad_norm": 0.0, "kl": 0.19703523814678192, "learning_rate": 1.914471142460495e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10427 }, { "completion_length": 484.75, "epoch": 2.8902439024390243, "grad_norm": 0.0, "kl": 0.2039453238248825, "learning_rate": 1.914045536708056e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10428 }, { "completion_length": 572.0, "epoch": 2.890521064301552, "grad_norm": 0.0, "kl": 0.17844553291797638, "learning_rate": 1.9136199489244023e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10429 }, { "completion_length": 511.0, "epoch": 2.8907982261640797, "grad_norm": 0.45070821046829224, "kl": 427249631232.0, "learning_rate": 1.9131943791225866e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10430 }, { "completion_length": 609.0, "epoch": 2.8910753880266076, "grad_norm": 0.0, "kl": 0.17541885375976562, "learning_rate": 1.9127688273156577e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10431 }, { "completion_length": 595.75, "epoch": 2.8913525498891355, "grad_norm": 0.0, "kl": 309616.03125, "learning_rate": 1.9123432935166665e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10432 }, { "completion_length": 517.25, "epoch": 2.891629711751663, "grad_norm": 0.0, "kl": 0.17681124806404114, "learning_rate": 1.9119177777386617e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10433 }, { "completion_length": 605.0, "epoch": 2.891906873614191, "grad_norm": 0.0, "kl": 0.2811585068702698, "learning_rate": 1.9114922799946927e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10434 }, { "completion_length": 665.5, "epoch": 2.8921840354767183, "grad_norm": 0.0, "kl": 0.1492803543806076, "learning_rate": 1.9110668002978063e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10435 }, { "completion_length": 542.5, "epoch": 2.8924611973392462, "grad_norm": 0.0, "kl": 0.20653943717479706, "learning_rate": 1.910641338661051e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10436 }, { "completion_length": 504.25, "epoch": 2.8927383592017737, "grad_norm": 0.0, "kl": 0.24274463951587677, "learning_rate": 1.910215895097476e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10437 }, { "completion_length": 595.5, "epoch": 2.8930155210643016, "grad_norm": 0.0, "kl": 0.19439557194709778, "learning_rate": 1.9097904696201242e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10438 }, { "completion_length": 583.0, "epoch": 2.8932926829268295, "grad_norm": 0.0, "kl": 0.1851395070552826, "learning_rate": 1.9093650622420447e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10439 }, { "completion_length": 526.5, "epoch": 2.893569844789357, "grad_norm": 0.0, "kl": 0.2628515064716339, "learning_rate": 1.908939672976281e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10440 }, { "completion_length": 553.0, "epoch": 2.8938470066518844, "grad_norm": 0.0, "kl": 0.20262539386749268, "learning_rate": 1.9085143018358798e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10441 }, { "completion_length": 572.75, "epoch": 2.8941241685144123, "grad_norm": 0.0, "kl": 0.19843798875808716, "learning_rate": 1.9080889488338833e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10442 }, { "completion_length": 528.75, "epoch": 2.8944013303769403, "grad_norm": 0.0, "kl": 0.21536247432231903, "learning_rate": 1.9076636139833366e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10443 }, { "completion_length": 549.25, "epoch": 2.8946784922394677, "grad_norm": 0.0, "kl": 0.25723475217819214, "learning_rate": 1.9072382972972838e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10444 }, { "completion_length": 451.75, "epoch": 2.8949556541019956, "grad_norm": 0.0, "kl": 0.2486950308084488, "learning_rate": 1.9068129987887656e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10445 }, { "completion_length": 583.75, "epoch": 2.8952328159645235, "grad_norm": 0.0, "kl": 0.46602579951286316, "learning_rate": 1.906387718470826e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10446 }, { "completion_length": 498.75, "epoch": 2.895509977827051, "grad_norm": 0.0, "kl": 0.22038596868515015, "learning_rate": 1.9059624563565043e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10447 }, { "completion_length": 511.25, "epoch": 2.8957871396895785, "grad_norm": 0.0, "kl": 0.25881674885749817, "learning_rate": 1.9055372124588448e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10448 }, { "completion_length": 538.5, "epoch": 2.8960643015521064, "grad_norm": 0.47758302092552185, "kl": 59326730240.0, "learning_rate": 1.9051119867908842e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10449 }, { "completion_length": 584.5, "epoch": 2.8963414634146343, "grad_norm": 0.0, "kl": 0.1864508092403412, "learning_rate": 1.9046867793656662e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10450 }, { "completion_length": 579.25, "epoch": 2.8966186252771617, "grad_norm": 0.0, "kl": 0.22915174067020416, "learning_rate": 1.9042615901962263e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10451 }, { "completion_length": 546.25, "epoch": 2.8968957871396896, "grad_norm": 0.0, "kl": 0.1796049028635025, "learning_rate": 1.903836419295606e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10452 }, { "completion_length": 472.75, "epoch": 2.8971729490022176, "grad_norm": 0.0, "kl": 0.21956440806388855, "learning_rate": 1.9034112666768433e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10453 }, { "completion_length": 562.75, "epoch": 2.897450110864745, "grad_norm": 0.0, "kl": 0.24927757680416107, "learning_rate": 1.9029861323529746e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10454 }, { "completion_length": 484.75, "epoch": 2.8977272727272725, "grad_norm": 0.0, "kl": 0.21569982171058655, "learning_rate": 1.9025610163370385e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10455 }, { "completion_length": 573.75, "epoch": 2.8980044345898004, "grad_norm": 0.0, "kl": 0.1780882328748703, "learning_rate": 1.90213591864207e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10456 }, { "completion_length": 606.5, "epoch": 2.8982815964523283, "grad_norm": 0.0, "kl": 0.16292957961559296, "learning_rate": 1.9017108392811065e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10457 }, { "completion_length": 480.75, "epoch": 2.8985587583148558, "grad_norm": 0.0, "kl": 0.22108282148838043, "learning_rate": 1.9012857782671815e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10458 }, { "completion_length": 618.75, "epoch": 2.8988359201773837, "grad_norm": 0.0, "kl": 0.18683336675167084, "learning_rate": 1.9008607356133315e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10459 }, { "completion_length": 470.0, "epoch": 2.8991130820399116, "grad_norm": 0.0, "kl": 0.1918138861656189, "learning_rate": 1.9004357113325914e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10460 }, { "completion_length": 496.5, "epoch": 2.899390243902439, "grad_norm": 0.0, "kl": 0.20766177773475647, "learning_rate": 1.900010705437993e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10461 }, { "completion_length": 495.25, "epoch": 2.8996674057649665, "grad_norm": 0.0, "kl": 0.1733618676662445, "learning_rate": 1.8995857179425708e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10462 }, { "completion_length": 482.25, "epoch": 2.8999445676274944, "grad_norm": 0.0, "kl": 0.2462364286184311, "learning_rate": 1.8991607488593567e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10463 }, { "completion_length": 561.0, "epoch": 2.9002217294900223, "grad_norm": 0.0, "kl": 190.16357421875, "learning_rate": 1.8987357982013835e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10464 }, { "completion_length": 485.25, "epoch": 2.90049889135255, "grad_norm": 0.0, "kl": 7276080.5, "learning_rate": 1.8983108659816816e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10465 }, { "completion_length": 551.25, "epoch": 2.9007760532150777, "grad_norm": 0.0, "kl": 0.20570603013038635, "learning_rate": 1.897885952213282e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10466 }, { "completion_length": 635.25, "epoch": 2.901053215077605, "grad_norm": 0.0, "kl": 0.1793954074382782, "learning_rate": 1.897461056909217e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10467 }, { "completion_length": 502.75, "epoch": 2.901330376940133, "grad_norm": 0.0, "kl": 23.27153968811035, "learning_rate": 1.8970361800825132e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10468 }, { "completion_length": 590.5, "epoch": 2.9016075388026605, "grad_norm": 0.0, "kl": 0.2507105767726898, "learning_rate": 1.8966113217462035e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10469 }, { "completion_length": 574.75, "epoch": 2.9018847006651884, "grad_norm": 0.0, "kl": 1151185280.0, "learning_rate": 1.8961864819133124e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10470 }, { "completion_length": 610.75, "epoch": 2.9021618625277164, "grad_norm": 0.0, "kl": 4699244514508800.0, "learning_rate": 1.8957616605968715e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10471 }, { "completion_length": 629.5, "epoch": 2.902439024390244, "grad_norm": 0.0, "kl": 0.1727534830570221, "learning_rate": 1.8953368578099063e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10472 }, { "completion_length": 542.25, "epoch": 2.9027161862527717, "grad_norm": 0.0, "kl": 0.22993898391723633, "learning_rate": 1.8949120735654444e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10473 }, { "completion_length": 632.0, "epoch": 2.902993348115299, "grad_norm": 0.0, "kl": 0.2062227725982666, "learning_rate": 1.8944873078765125e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10474 }, { "completion_length": 502.25, "epoch": 2.903270509977827, "grad_norm": 0.0, "kl": 0.17820924520492554, "learning_rate": 1.8940625607561354e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10475 }, { "completion_length": 516.75, "epoch": 2.9035476718403546, "grad_norm": 0.0, "kl": 0.1969168782234192, "learning_rate": 1.8936378322173395e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10476 }, { "completion_length": 472.5, "epoch": 2.9038248337028825, "grad_norm": 0.0, "kl": 0.2252924144268036, "learning_rate": 1.8932131222731481e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10477 }, { "completion_length": 534.0, "epoch": 2.9041019955654104, "grad_norm": 0.0, "kl": 0.20199444890022278, "learning_rate": 1.8927884309365873e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10478 }, { "completion_length": 573.25, "epoch": 2.904379157427938, "grad_norm": 0.444772332906723, "kl": 3.1414414723147366e+17, "learning_rate": 1.892363758220678e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10479 }, { "completion_length": 534.5, "epoch": 2.9046563192904657, "grad_norm": 0.0, "kl": 0.21492178738117218, "learning_rate": 1.891939104138445e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10480 }, { "completion_length": 482.0, "epoch": 2.904933481152993, "grad_norm": 0.0, "kl": 0.2790886163711548, "learning_rate": 1.8915144687029107e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10481 }, { "completion_length": 536.5, "epoch": 2.905210643015521, "grad_norm": 0.8387181162834167, "kl": 1.359480672443433e+17, "learning_rate": 1.891089851927096e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10482 }, { "completion_length": 543.25, "epoch": 2.9054878048780486, "grad_norm": 0.0, "kl": 0.1846330463886261, "learning_rate": 1.890665253824023e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10483 }, { "completion_length": 546.25, "epoch": 2.9057649667405765, "grad_norm": 0.0, "kl": 0.182949498295784, "learning_rate": 1.8902406744067114e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10484 }, { "completion_length": 490.75, "epoch": 2.9060421286031044, "grad_norm": 0.0, "kl": 0.19262555241584778, "learning_rate": 1.8898161136881823e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10485 }, { "completion_length": 557.25, "epoch": 2.906319290465632, "grad_norm": 0.0, "kl": 0.20580285787582397, "learning_rate": 1.8893915716814542e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10486 }, { "completion_length": 534.0, "epoch": 2.9065964523281598, "grad_norm": 0.0, "kl": 0.19907604157924652, "learning_rate": 1.888967048399547e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10487 }, { "completion_length": 541.5, "epoch": 2.9068736141906872, "grad_norm": 0.0, "kl": 0.20946386456489563, "learning_rate": 1.888542543855478e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10488 }, { "completion_length": 570.75, "epoch": 2.907150776053215, "grad_norm": 0.0, "kl": 0.19052578508853912, "learning_rate": 1.8881180580622648e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10489 }, { "completion_length": 537.0, "epoch": 2.9074279379157426, "grad_norm": 0.0, "kl": 0.25627967715263367, "learning_rate": 1.887693591032927e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10490 }, { "completion_length": 602.75, "epoch": 2.9077050997782705, "grad_norm": 2.3810770511627197, "kl": 3584.17431640625, "learning_rate": 1.887269142780479e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10491 }, { "completion_length": 594.0, "epoch": 2.9079822616407984, "grad_norm": 0.0, "kl": 0.2218104600906372, "learning_rate": 1.8868447133179376e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10492 }, { "completion_length": 506.5, "epoch": 2.908259423503326, "grad_norm": 5.245875358581543, "kl": 931.3302612304688, "learning_rate": 1.886420302658318e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10493 }, { "completion_length": 542.0, "epoch": 2.908536585365854, "grad_norm": 0.0, "kl": 0.2469247430562973, "learning_rate": 1.8859959108146359e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10494 }, { "completion_length": 551.5, "epoch": 2.9088137472283813, "grad_norm": 0.0, "kl": 4.23107922797527e+16, "learning_rate": 1.8855715377999042e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10495 }, { "completion_length": 630.75, "epoch": 2.909090909090909, "grad_norm": 0.0, "kl": 0.21083791553974152, "learning_rate": 1.885147183627137e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10496 }, { "completion_length": 727.25, "epoch": 2.9093680709534366, "grad_norm": 0.0, "kl": 0.15167880058288574, "learning_rate": 1.88472284830935e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10497 }, { "completion_length": 656.75, "epoch": 2.9096452328159645, "grad_norm": 0.0, "kl": 0.18910464644432068, "learning_rate": 1.8842985318595516e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10498 }, { "completion_length": 526.5, "epoch": 2.9099223946784925, "grad_norm": 0.0, "kl": 4.083691412258816e+16, "learning_rate": 1.8838742342907578e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10499 }, { "completion_length": 601.75, "epoch": 2.91019955654102, "grad_norm": 0.0, "kl": 0.184085413813591, "learning_rate": 1.8834499556159766e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10500 }, { "completion_length": 510.25, "epoch": 2.910476718403548, "grad_norm": 2.068377733230591, "kl": 15287802920960.0, "learning_rate": 1.8830256958482215e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10501 }, { "completion_length": 503.0, "epoch": 2.9107538802660753, "grad_norm": 0.0, "kl": 0.235530287027359, "learning_rate": 1.882601455000501e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10502 }, { "completion_length": 562.75, "epoch": 2.911031042128603, "grad_norm": 1.842788577079773, "kl": 0.7222853302955627, "learning_rate": 1.8821772330858259e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10503 }, { "completion_length": 596.0, "epoch": 2.9113082039911307, "grad_norm": 0.0, "kl": 0.20002685487270355, "learning_rate": 1.8817530301172054e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10504 }, { "completion_length": 533.75, "epoch": 2.9115853658536586, "grad_norm": 0.0, "kl": 0.1874091923236847, "learning_rate": 1.8813288461076474e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10505 }, { "completion_length": 625.25, "epoch": 2.9118625277161865, "grad_norm": 0.0, "kl": 9025.81640625, "learning_rate": 1.8809046810701604e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10506 }, { "completion_length": 665.0, "epoch": 2.912139689578714, "grad_norm": 0.0, "kl": 0.16514290869235992, "learning_rate": 1.8804805350177507e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10507 }, { "completion_length": 581.0, "epoch": 2.912416851441242, "grad_norm": 4.934631824493408, "kl": 3799253248.0, "learning_rate": 1.8800564079634264e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10508 }, { "completion_length": 536.75, "epoch": 2.9126940133037693, "grad_norm": 0.5201700329780579, "kl": 0.566669225692749, "learning_rate": 1.8796322999201922e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10509 }, { "completion_length": 475.25, "epoch": 2.912971175166297, "grad_norm": 11.791680335998535, "kl": 0.2438320815563202, "learning_rate": 1.8792082109010553e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10510 }, { "completion_length": 601.0, "epoch": 2.9132483370288247, "grad_norm": 0.0, "kl": 1.8673431873321533, "learning_rate": 1.8787841409190208e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10511 }, { "completion_length": 541.75, "epoch": 2.9135254988913526, "grad_norm": 0.0, "kl": 0.22425638139247894, "learning_rate": 1.8783600899870918e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10512 }, { "completion_length": 555.5, "epoch": 2.9138026607538805, "grad_norm": 0.0, "kl": 0.22419139742851257, "learning_rate": 1.8779360581182735e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10513 }, { "completion_length": 519.25, "epoch": 2.914079822616408, "grad_norm": 0.0, "kl": 0.19102230668067932, "learning_rate": 1.877512045325568e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10514 }, { "completion_length": 597.75, "epoch": 2.9143569844789354, "grad_norm": 1.5324828624725342, "kl": 27886219264.0, "learning_rate": 1.8770880516219788e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10515 }, { "completion_length": 539.0, "epoch": 2.9146341463414633, "grad_norm": 0.0, "kl": 0.22421011328697205, "learning_rate": 1.8766640770205077e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10516 }, { "completion_length": 594.0, "epoch": 2.9149113082039912, "grad_norm": 0.0, "kl": 0.19337597489356995, "learning_rate": 1.8762401215341569e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10517 }, { "completion_length": 578.5, "epoch": 2.9151884700665187, "grad_norm": 0.0, "kl": 0.20807303488254547, "learning_rate": 1.875816185175926e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10518 }, { "completion_length": 558.75, "epoch": 2.9154656319290466, "grad_norm": 0.0, "kl": 1.3067457675933838, "learning_rate": 1.8753922679588158e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10519 }, { "completion_length": 644.5, "epoch": 2.9157427937915745, "grad_norm": 0.0, "kl": 0.18155217170715332, "learning_rate": 1.874968369895828e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10520 }, { "completion_length": 590.5, "epoch": 2.916019955654102, "grad_norm": 0.0, "kl": 0.18212027847766876, "learning_rate": 1.8745444909999582e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10521 }, { "completion_length": 541.75, "epoch": 2.9162971175166295, "grad_norm": 0.0, "kl": 0.1965557485818863, "learning_rate": 1.8741206312842085e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10522 }, { "completion_length": 540.75, "epoch": 2.9165742793791574, "grad_norm": 0.5120370388031006, "kl": 2.085049855901696e+16, "learning_rate": 1.873696790761575e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10523 }, { "completion_length": 595.75, "epoch": 2.9168514412416853, "grad_norm": 0.0, "kl": 0.19738830626010895, "learning_rate": 1.8732729694450563e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10524 }, { "completion_length": 548.0, "epoch": 2.9171286031042127, "grad_norm": 0.0, "kl": 0.19031637907028198, "learning_rate": 1.8728491673476476e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10525 }, { "completion_length": 519.75, "epoch": 2.9174057649667406, "grad_norm": 0.0, "kl": 0.1846432089805603, "learning_rate": 1.872425384482346e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10526 }, { "completion_length": 530.25, "epoch": 2.9176829268292686, "grad_norm": 0.0, "kl": 0.22321805357933044, "learning_rate": 1.872001620862148e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10527 }, { "completion_length": 479.25, "epoch": 2.917960088691796, "grad_norm": 0.0, "kl": 0.2869071364402771, "learning_rate": 1.8715778765000468e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10528 }, { "completion_length": 552.0, "epoch": 2.9182372505543235, "grad_norm": 0.0, "kl": 0.20366564393043518, "learning_rate": 1.8711541514090392e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10529 }, { "completion_length": 529.25, "epoch": 2.9185144124168514, "grad_norm": 0.0, "kl": 0.1958017349243164, "learning_rate": 1.8707304456021167e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10530 }, { "completion_length": 565.5, "epoch": 2.9187915742793793, "grad_norm": 0.0, "kl": 3806348194086912.0, "learning_rate": 1.8703067590922753e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10531 }, { "completion_length": 602.0, "epoch": 2.9190687361419068, "grad_norm": 0.0, "kl": 0.19685979187488556, "learning_rate": 1.8698830918925043e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10532 }, { "completion_length": 561.75, "epoch": 2.9193458980044347, "grad_norm": 0.0, "kl": 0.19022469222545624, "learning_rate": 1.8694594440157981e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10533 }, { "completion_length": 555.5, "epoch": 2.9196230598669626, "grad_norm": 0.45063677430152893, "kl": 0.22308148443698883, "learning_rate": 1.8690358154751487e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10534 }, { "completion_length": 614.5, "epoch": 2.91990022172949, "grad_norm": 0.0, "kl": 0.19522778689861298, "learning_rate": 1.8686122062835451e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10535 }, { "completion_length": 512.75, "epoch": 2.9201773835920175, "grad_norm": 0.0, "kl": 0.2290586680173874, "learning_rate": 1.8681886164539791e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10536 }, { "completion_length": 572.5, "epoch": 2.9204545454545454, "grad_norm": 0.0, "kl": 0.20380572974681854, "learning_rate": 1.86776504599944e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10537 }, { "completion_length": 539.75, "epoch": 2.9207317073170733, "grad_norm": 0.0, "kl": 0.2165503054857254, "learning_rate": 1.867341494932917e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10538 }, { "completion_length": 519.25, "epoch": 2.921008869179601, "grad_norm": 3.412109851837158, "kl": 45248352.0, "learning_rate": 1.8669179632673985e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10539 }, { "completion_length": 670.5, "epoch": 2.9212860310421287, "grad_norm": 0.0, "kl": 0.24808862805366516, "learning_rate": 1.8664944510158716e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10540 }, { "completion_length": 569.25, "epoch": 2.921563192904656, "grad_norm": 0.0, "kl": 0.1873372197151184, "learning_rate": 1.8660709581913256e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10541 }, { "completion_length": 555.0, "epoch": 2.921840354767184, "grad_norm": 1.1689692735671997, "kl": 2234128793600.0, "learning_rate": 1.8656474848067458e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10542 }, { "completion_length": 502.25, "epoch": 2.9221175166297115, "grad_norm": 0.4409213066101074, "kl": 31926542336.0, "learning_rate": 1.8652240308751194e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10543 }, { "completion_length": 595.5, "epoch": 2.9223946784922394, "grad_norm": 0.0, "kl": 0.16447842121124268, "learning_rate": 1.8648005964094307e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10544 }, { "completion_length": 544.0, "epoch": 2.9226718403547673, "grad_norm": 0.0, "kl": 0.20550337433815002, "learning_rate": 1.8643771814226659e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10545 }, { "completion_length": 572.75, "epoch": 2.922949002217295, "grad_norm": 0.0, "kl": 0.2303074151277542, "learning_rate": 1.8639537859278084e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10546 }, { "completion_length": 575.25, "epoch": 2.9232261640798227, "grad_norm": 0.0, "kl": 0.17711026966571808, "learning_rate": 1.8635304099378426e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10547 }, { "completion_length": 621.75, "epoch": 2.92350332594235, "grad_norm": 0.0, "kl": 0.37600284814834595, "learning_rate": 1.8631070534657513e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10548 }, { "completion_length": 611.0, "epoch": 2.923780487804878, "grad_norm": 0.0, "kl": 0.187727689743042, "learning_rate": 1.8626837165245165e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10549 }, { "completion_length": 664.25, "epoch": 2.9240576496674056, "grad_norm": 0.0, "kl": 0.44636455178260803, "learning_rate": 1.8622603991271229e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10550 }, { "completion_length": 515.75, "epoch": 2.9243348115299335, "grad_norm": 0.0, "kl": 0.18828323483467102, "learning_rate": 1.861837101286548e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10551 }, { "completion_length": 539.25, "epoch": 2.9246119733924614, "grad_norm": 0.0, "kl": 0.23474907875061035, "learning_rate": 1.8614138230157757e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10552 }, { "completion_length": 533.25, "epoch": 2.924889135254989, "grad_norm": 0.0, "kl": 0.22562141716480255, "learning_rate": 1.8609905643277845e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10553 }, { "completion_length": 584.0, "epoch": 2.9251662971175167, "grad_norm": 0.5322012305259705, "kl": 4960735232.0, "learning_rate": 1.860567325235555e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10554 }, { "completion_length": 605.75, "epoch": 2.925443458980044, "grad_norm": 0.0, "kl": 0.17718106508255005, "learning_rate": 1.860144105752065e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10555 }, { "completion_length": 630.0, "epoch": 2.925720620842572, "grad_norm": 0.0, "kl": 0.17292474210262299, "learning_rate": 1.8597209058902938e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10556 }, { "completion_length": 574.5, "epoch": 2.9259977827050996, "grad_norm": 0.0, "kl": 0.18822723627090454, "learning_rate": 1.8592977256632192e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10557 }, { "completion_length": 512.25, "epoch": 2.9262749445676275, "grad_norm": 0.0, "kl": 8.263111114501953, "learning_rate": 1.8588745650838176e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10558 }, { "completion_length": 685.0, "epoch": 2.9265521064301554, "grad_norm": 0.0, "kl": 0.15678362548351288, "learning_rate": 1.8584514241650667e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10559 }, { "completion_length": 581.75, "epoch": 2.926829268292683, "grad_norm": 0.0, "kl": 0.19577807188034058, "learning_rate": 1.8580283029199409e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10560 }, { "completion_length": 585.5, "epoch": 2.9271064301552108, "grad_norm": 0.0, "kl": 165946240.0, "learning_rate": 1.8576052013614177e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10561 }, { "completion_length": 543.5, "epoch": 2.9273835920177382, "grad_norm": 0.0, "kl": 0.24901162087917328, "learning_rate": 1.8571821195024693e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10562 }, { "completion_length": 677.25, "epoch": 2.927660753880266, "grad_norm": 0.0, "kl": 0.1612001359462738, "learning_rate": 1.8567590573560717e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10563 }, { "completion_length": 469.5, "epoch": 2.9279379157427936, "grad_norm": 0.0, "kl": 6448991642320896.0, "learning_rate": 1.8563360149351988e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10564 }, { "completion_length": 565.75, "epoch": 2.9282150776053215, "grad_norm": 0.0, "kl": 0.31505367159843445, "learning_rate": 1.8559129922528218e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10565 }, { "completion_length": 512.75, "epoch": 2.9284922394678494, "grad_norm": 0.6283429265022278, "kl": 4430505994878976.0, "learning_rate": 1.855489989321915e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10566 }, { "completion_length": 607.5, "epoch": 2.928769401330377, "grad_norm": 0.0, "kl": 0.18009057641029358, "learning_rate": 1.8550670061554481e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10567 }, { "completion_length": 668.75, "epoch": 2.929046563192905, "grad_norm": 0.0, "kl": 0.1703488677740097, "learning_rate": 1.8546440427663942e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10568 }, { "completion_length": 577.75, "epoch": 2.9293237250554323, "grad_norm": 7.703738212585449, "kl": 1.9570415019989014, "learning_rate": 1.8542210991677222e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10569 }, { "completion_length": 719.75, "epoch": 2.92960088691796, "grad_norm": 20.301036834716797, "kl": 0.9288085103034973, "learning_rate": 1.853798175372402e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10570 }, { "completion_length": 586.0, "epoch": 2.9298780487804876, "grad_norm": 3.1062235832214355, "kl": 0.25948360562324524, "learning_rate": 1.853375271393405e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10571 }, { "completion_length": 564.25, "epoch": 2.9301552106430155, "grad_norm": 0.0, "kl": 37249664155648.0, "learning_rate": 1.852952387243698e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10572 }, { "completion_length": 611.25, "epoch": 2.9304323725055434, "grad_norm": 0.0, "kl": 0.20871351659297943, "learning_rate": 1.85252952293625e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10573 }, { "completion_length": 561.5, "epoch": 2.930709534368071, "grad_norm": 0.0, "kl": 0.18739113211631775, "learning_rate": 1.8521066784840278e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10574 }, { "completion_length": 615.5, "epoch": 2.930986696230599, "grad_norm": 0.0, "kl": 0.22767633199691772, "learning_rate": 1.851683853899999e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10575 }, { "completion_length": 511.5, "epoch": 2.9312638580931263, "grad_norm": 0.0, "kl": 0.24112167954444885, "learning_rate": 1.851261049197129e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10576 }, { "completion_length": 552.5, "epoch": 2.931541019955654, "grad_norm": 0.0, "kl": 0.22390460968017578, "learning_rate": 1.8508382643883837e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10577 }, { "completion_length": 600.0, "epoch": 2.9318181818181817, "grad_norm": 1.2650989294052124, "kl": 71299812556800.0, "learning_rate": 1.850415499486729e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10578 }, { "completion_length": 624.75, "epoch": 2.9320953436807096, "grad_norm": 0.0, "kl": 0.1528356969356537, "learning_rate": 1.8499927545051277e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10579 }, { "completion_length": 666.25, "epoch": 2.9323725055432375, "grad_norm": 0.0, "kl": 0.16436953842639923, "learning_rate": 1.8495700294565466e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10580 }, { "completion_length": 532.25, "epoch": 2.932649667405765, "grad_norm": 0.0, "kl": 0.22495019435882568, "learning_rate": 1.849147324353945e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10581 }, { "completion_length": 629.0, "epoch": 2.932926829268293, "grad_norm": 0.0, "kl": 0.24505075812339783, "learning_rate": 1.8487246392102892e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10582 }, { "completion_length": 605.5, "epoch": 2.9332039911308203, "grad_norm": 0.5520317554473877, "kl": 1185860066213888.0, "learning_rate": 1.8483019740385378e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10583 }, { "completion_length": 563.0, "epoch": 2.933481152993348, "grad_norm": 0.0, "kl": 0.16951854526996613, "learning_rate": 1.847879328851655e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10584 }, { "completion_length": 562.75, "epoch": 2.9337583148558757, "grad_norm": 0.0, "kl": 1.5416455268859863, "learning_rate": 1.8474567036625996e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10585 }, { "completion_length": 571.5, "epoch": 2.9340354767184036, "grad_norm": 0.0, "kl": 0.18677015602588654, "learning_rate": 1.8470340984843329e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10586 }, { "completion_length": 535.0, "epoch": 2.9343126385809315, "grad_norm": 0.0, "kl": 0.19487160444259644, "learning_rate": 1.8466115133298145e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10587 }, { "completion_length": 640.75, "epoch": 2.934589800443459, "grad_norm": 0.0, "kl": 0.7842198014259338, "learning_rate": 1.8461889482120028e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10588 }, { "completion_length": 556.75, "epoch": 2.9348669623059864, "grad_norm": 0.0, "kl": 0.22119034826755524, "learning_rate": 1.8457664031438567e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10589 }, { "completion_length": 537.0, "epoch": 2.9351441241685143, "grad_norm": 0.0, "kl": 0.19207951426506042, "learning_rate": 1.8453438781383326e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10590 }, { "completion_length": 591.5, "epoch": 2.9354212860310422, "grad_norm": 0.0, "kl": 0.17405109107494354, "learning_rate": 1.844921373208389e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10591 }, { "completion_length": 639.25, "epoch": 2.9356984478935697, "grad_norm": 2.038952112197876, "kl": 33189610.0, "learning_rate": 1.844498888366981e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10592 }, { "completion_length": 568.0, "epoch": 2.9359756097560976, "grad_norm": 0.0, "kl": 149433320734720.0, "learning_rate": 1.8440764236270659e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10593 }, { "completion_length": 527.0, "epoch": 2.9362527716186255, "grad_norm": 0.0, "kl": 0.2695571482181549, "learning_rate": 1.8436539790015984e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10594 }, { "completion_length": 510.5, "epoch": 2.936529933481153, "grad_norm": 2.2898526191711426, "kl": 420067328.0, "learning_rate": 1.8432315545035328e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10595 }, { "completion_length": 645.25, "epoch": 2.9368070953436805, "grad_norm": 0.0, "kl": 0.1807551383972168, "learning_rate": 1.842809150145824e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10596 }, { "completion_length": 614.25, "epoch": 2.9370842572062084, "grad_norm": 0.0, "kl": 0.16396431624889374, "learning_rate": 1.842386765941424e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10597 }, { "completion_length": 608.0, "epoch": 2.9373614190687363, "grad_norm": 0.4175374209880829, "kl": 0.2389575093984604, "learning_rate": 1.8419644019032868e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10598 }, { "completion_length": 626.5, "epoch": 2.9376385809312637, "grad_norm": 0.861910343170166, "kl": 0.182547464966774, "learning_rate": 1.8415420580443633e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10599 }, { "completion_length": 590.25, "epoch": 2.9379157427937916, "grad_norm": 0.0, "kl": 0.23446865379810333, "learning_rate": 1.8411197343776054e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10600 }, { "completion_length": 621.5, "epoch": 2.9381929046563195, "grad_norm": 0.0, "kl": 0.17505526542663574, "learning_rate": 1.8406974309159664e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10601 }, { "completion_length": 585.5, "epoch": 2.938470066518847, "grad_norm": 0.0, "kl": 314407.0625, "learning_rate": 1.8402751476723923e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10602 }, { "completion_length": 543.0, "epoch": 2.9387472283813745, "grad_norm": 0.480704128742218, "kl": 2390748160.0, "learning_rate": 1.8398528846598365e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10603 }, { "completion_length": 512.5, "epoch": 2.9390243902439024, "grad_norm": 0.0, "kl": 0.1848895251750946, "learning_rate": 1.8394306418912462e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10604 }, { "completion_length": 515.75, "epoch": 2.9393015521064303, "grad_norm": 0.0, "kl": 0.18081921339035034, "learning_rate": 1.8390084193795703e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10605 }, { "completion_length": 531.5, "epoch": 2.9395787139689578, "grad_norm": 0.0, "kl": 0.19057884812355042, "learning_rate": 1.838586217137756e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10606 }, { "completion_length": 660.25, "epoch": 2.9398558758314857, "grad_norm": 2.1202824115753174, "kl": 4556617216.0, "learning_rate": 1.8381640351787516e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10607 }, { "completion_length": 632.0, "epoch": 2.9401330376940136, "grad_norm": 0.0, "kl": 0.17956876754760742, "learning_rate": 1.8377418735155031e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10608 }, { "completion_length": 600.0, "epoch": 2.940410199556541, "grad_norm": 0.0, "kl": 0.175331711769104, "learning_rate": 1.837319732160956e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10609 }, { "completion_length": 697.0, "epoch": 2.9406873614190685, "grad_norm": 0.2803153693675995, "kl": 0.1789756566286087, "learning_rate": 1.8368976111280564e-06, "loss": -0.0, "reward": 5.5625, "reward_std": 0.375, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5625, "step": 10610 }, { "completion_length": 588.5, "epoch": 2.9409645232815964, "grad_norm": 0.8482640385627747, "kl": 10575966208.0, "learning_rate": 1.8364755104297477e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10611 }, { "completion_length": 652.5, "epoch": 2.9412416851441243, "grad_norm": 0.0, "kl": 0.20954154431819916, "learning_rate": 1.8360534300789766e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10612 }, { "completion_length": 563.0, "epoch": 2.941518847006652, "grad_norm": 0.0, "kl": 0.212699756026268, "learning_rate": 1.835631370088683e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10613 }, { "completion_length": 523.0, "epoch": 2.9417960088691797, "grad_norm": 0.0, "kl": 0.2272222489118576, "learning_rate": 1.8352093304718136e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10614 }, { "completion_length": 548.75, "epoch": 2.942073170731707, "grad_norm": 0.0, "kl": 0.21895335614681244, "learning_rate": 1.8347873112413068e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10615 }, { "completion_length": 486.0, "epoch": 2.942350332594235, "grad_norm": 0.0, "kl": 0.20787258446216583, "learning_rate": 1.8343653124101064e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10616 }, { "completion_length": 602.0, "epoch": 2.9426274944567625, "grad_norm": 0.0, "kl": 0.18417690694332123, "learning_rate": 1.8339433339911534e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10617 }, { "completion_length": 547.75, "epoch": 2.9429046563192904, "grad_norm": 0.0, "kl": 0.20046241581439972, "learning_rate": 1.8335213759973868e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10618 }, { "completion_length": 605.0, "epoch": 2.9431818181818183, "grad_norm": 0.0, "kl": 0.21357357501983643, "learning_rate": 1.8330994384417484e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10619 }, { "completion_length": 550.5, "epoch": 2.943458980044346, "grad_norm": 0.0, "kl": 0.18097996711730957, "learning_rate": 1.8326775213371747e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10620 }, { "completion_length": 739.0, "epoch": 2.9437361419068737, "grad_norm": 0.0, "kl": 0.17259657382965088, "learning_rate": 1.832255624696606e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10621 }, { "completion_length": 602.75, "epoch": 2.944013303769401, "grad_norm": 0.0, "kl": 0.22785980999469757, "learning_rate": 1.831833748532979e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10622 }, { "completion_length": 582.5, "epoch": 2.944290465631929, "grad_norm": 0.0, "kl": 0.18991820514202118, "learning_rate": 1.8314118928592318e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10623 }, { "completion_length": 622.0, "epoch": 2.9445676274944566, "grad_norm": 0.0, "kl": 0.18038012087345123, "learning_rate": 1.830990057688301e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10624 }, { "completion_length": 463.75, "epoch": 2.9448447893569845, "grad_norm": 0.0, "kl": 0.35457491874694824, "learning_rate": 1.8305682430331217e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10625 }, { "completion_length": 643.0, "epoch": 2.9451219512195124, "grad_norm": 0.0, "kl": 0.19525454938411713, "learning_rate": 1.83014644890663e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10626 }, { "completion_length": 547.5, "epoch": 2.94539911308204, "grad_norm": 0.0, "kl": 11873559773184.0, "learning_rate": 1.8297246753217595e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10627 }, { "completion_length": 611.25, "epoch": 2.9456762749445677, "grad_norm": 0.0, "kl": 0.21109867095947266, "learning_rate": 1.829302922291446e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10628 }, { "completion_length": 626.25, "epoch": 2.945953436807095, "grad_norm": 0.0, "kl": 0.17119884490966797, "learning_rate": 1.828881189828621e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10629 }, { "completion_length": 559.75, "epoch": 2.946230598669623, "grad_norm": 0.0, "kl": 0.2358667105436325, "learning_rate": 1.8284594779462176e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10630 }, { "completion_length": 607.25, "epoch": 2.9465077605321506, "grad_norm": 0.0, "kl": 0.21965305507183075, "learning_rate": 1.82803778665717e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10631 }, { "completion_length": 639.0, "epoch": 2.9467849223946785, "grad_norm": 0.0, "kl": 0.17242591083049774, "learning_rate": 1.8276161159744066e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10632 }, { "completion_length": 531.75, "epoch": 2.9470620842572064, "grad_norm": 0.0, "kl": 0.2147301882505417, "learning_rate": 1.8271944659108615e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10633 }, { "completion_length": 516.5, "epoch": 2.947339246119734, "grad_norm": 0.0, "kl": 0.2494613230228424, "learning_rate": 1.826772836479462e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10634 }, { "completion_length": 534.5, "epoch": 2.9476164079822618, "grad_norm": 0.0, "kl": 0.24419325590133667, "learning_rate": 1.8263512276931398e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10635 }, { "completion_length": 698.0, "epoch": 2.9478935698447892, "grad_norm": 0.0, "kl": 0.154917910695076, "learning_rate": 1.8259296395648228e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10636 }, { "completion_length": 626.0, "epoch": 2.948170731707317, "grad_norm": 0.47914570569992065, "kl": 0.1819416731595993, "learning_rate": 1.8255080721074391e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10637 }, { "completion_length": 685.75, "epoch": 2.9484478935698446, "grad_norm": 0.0, "kl": 0.18153104186058044, "learning_rate": 1.8250865253339184e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10638 }, { "completion_length": 705.75, "epoch": 2.9487250554323725, "grad_norm": 0.0, "kl": 0.1561422049999237, "learning_rate": 1.8246649992571853e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10639 }, { "completion_length": 623.75, "epoch": 2.9490022172949004, "grad_norm": 0.358245849609375, "kl": 0.1860034316778183, "learning_rate": 1.8242434938901683e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10640 }, { "completion_length": 572.25, "epoch": 2.949279379157428, "grad_norm": 0.5010083913803101, "kl": 1996633927581696.0, "learning_rate": 1.8238220092457909e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10641 }, { "completion_length": 568.25, "epoch": 2.949556541019956, "grad_norm": 0.0, "kl": 0.29465875029563904, "learning_rate": 1.8234005453369812e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10642 }, { "completion_length": 545.25, "epoch": 2.9498337028824833, "grad_norm": 1.2515816688537598, "kl": 10134849847296.0, "learning_rate": 1.8229791021766607e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10643 }, { "completion_length": 531.25, "epoch": 2.950110864745011, "grad_norm": 0.0, "kl": 0.3592436909675598, "learning_rate": 1.8225576797777565e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10644 }, { "completion_length": 658.0, "epoch": 2.9503880266075386, "grad_norm": 0.0, "kl": 0.17805270850658417, "learning_rate": 1.8221362781531884e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10645 }, { "completion_length": 657.75, "epoch": 2.9506651884700665, "grad_norm": 2.8426973819732666, "kl": 0.17661182582378387, "learning_rate": 1.8217148973158817e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10646 }, { "completion_length": 634.0, "epoch": 2.9509423503325944, "grad_norm": 0.0, "kl": 0.18879054486751556, "learning_rate": 1.8212935372787575e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10647 }, { "completion_length": 588.25, "epoch": 2.951219512195122, "grad_norm": 0.0, "kl": 0.20475181937217712, "learning_rate": 1.8208721980547367e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10648 }, { "completion_length": 610.75, "epoch": 2.95149667405765, "grad_norm": 2.141728162765503, "kl": 2602557.5, "learning_rate": 1.8204508796567413e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10649 }, { "completion_length": 717.5, "epoch": 2.9517738359201773, "grad_norm": 0.0, "kl": 0.1807662695646286, "learning_rate": 1.8200295820976899e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10650 }, { "completion_length": 511.0, "epoch": 2.952050997782705, "grad_norm": 0.0, "kl": 0.24339623749256134, "learning_rate": 1.8196083053905034e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10651 }, { "completion_length": 576.75, "epoch": 2.9523281596452327, "grad_norm": 0.0, "kl": 0.17987366020679474, "learning_rate": 1.819187049548099e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10652 }, { "completion_length": 654.25, "epoch": 2.9526053215077606, "grad_norm": 0.0, "kl": 0.1624917834997177, "learning_rate": 1.818765814583395e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10653 }, { "completion_length": 547.25, "epoch": 2.9528824833702885, "grad_norm": 0.0, "kl": 0.18645745515823364, "learning_rate": 1.8183446005093108e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10654 }, { "completion_length": 627.5, "epoch": 2.953159645232816, "grad_norm": 0.0, "kl": 0.1804208904504776, "learning_rate": 1.817923407338762e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10655 }, { "completion_length": 623.5, "epoch": 2.953436807095344, "grad_norm": 0.0, "kl": 0.23924171924591064, "learning_rate": 1.8175022350846649e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10656 }, { "completion_length": 663.5, "epoch": 2.9537139689578713, "grad_norm": 0.0, "kl": 0.17990179359912872, "learning_rate": 1.8170810837599345e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10657 }, { "completion_length": 557.5, "epoch": 2.953991130820399, "grad_norm": 0.0, "kl": 0.19603244960308075, "learning_rate": 1.8166599533774876e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10658 }, { "completion_length": 572.25, "epoch": 2.9542682926829267, "grad_norm": 0.0, "kl": 0.21969805657863617, "learning_rate": 1.8162388439502363e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10659 }, { "completion_length": 605.75, "epoch": 2.9545454545454546, "grad_norm": 0.0, "kl": 0.24382954835891724, "learning_rate": 1.8158177554910953e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10660 }, { "completion_length": 561.25, "epoch": 2.9548226164079825, "grad_norm": 0.0, "kl": 0.18410803377628326, "learning_rate": 1.815396688012978e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10661 }, { "completion_length": 634.0, "epoch": 2.95509977827051, "grad_norm": 3.133624315261841, "kl": 0.17531824111938477, "learning_rate": 1.8149756415287955e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10662 }, { "completion_length": 611.75, "epoch": 2.9553769401330374, "grad_norm": 0.0, "kl": 0.17470906674861908, "learning_rate": 1.8145546160514622e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10663 }, { "completion_length": 499.5, "epoch": 2.9556541019955653, "grad_norm": 0.0, "kl": 0.18450967967510223, "learning_rate": 1.8141336115938857e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10664 }, { "completion_length": 569.0, "epoch": 2.9559312638580932, "grad_norm": 0.39623314142227173, "kl": 0.23957213759422302, "learning_rate": 1.8137126281689792e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10665 }, { "completion_length": 644.5, "epoch": 2.9562084257206207, "grad_norm": 0.33957046270370483, "kl": 0.16296078264713287, "learning_rate": 1.8132916657896513e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10666 }, { "completion_length": 605.75, "epoch": 2.9564855875831486, "grad_norm": 0.0, "kl": 0.17549733817577362, "learning_rate": 1.8128707244688109e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10667 }, { "completion_length": 615.75, "epoch": 2.9567627494456765, "grad_norm": 0.0, "kl": 0.16835874319076538, "learning_rate": 1.8124498042193678e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10668 }, { "completion_length": 551.25, "epoch": 2.957039911308204, "grad_norm": 0.0, "kl": 0.18874695897102356, "learning_rate": 1.8120289050542286e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10669 }, { "completion_length": 658.0, "epoch": 2.9573170731707314, "grad_norm": 0.0, "kl": 0.19933246076107025, "learning_rate": 1.8116080269863015e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10670 }, { "completion_length": 614.75, "epoch": 2.9575942350332594, "grad_norm": 0.0, "kl": 0.4598158299922943, "learning_rate": 1.811187170028492e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10671 }, { "completion_length": 638.75, "epoch": 2.9578713968957873, "grad_norm": 0.0, "kl": 0.2113964557647705, "learning_rate": 1.810766334193707e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10672 }, { "completion_length": 663.75, "epoch": 2.9581485587583147, "grad_norm": 0.0, "kl": 0.4386354982852936, "learning_rate": 1.8103455194948505e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10673 }, { "completion_length": 557.75, "epoch": 2.9584257206208426, "grad_norm": 7.665321350097656, "kl": 3098.5673828125, "learning_rate": 1.8099247259448282e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10674 }, { "completion_length": 595.25, "epoch": 2.9587028824833705, "grad_norm": 0.0, "kl": 0.2441299855709076, "learning_rate": 1.8095039535565452e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10675 }, { "completion_length": 610.5, "epoch": 2.958980044345898, "grad_norm": 1.5451470613479614, "kl": 49258.05078125, "learning_rate": 1.8090832023429022e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10676 }, { "completion_length": 584.75, "epoch": 2.9592572062084255, "grad_norm": 0.0, "kl": 0.1855815351009369, "learning_rate": 1.8086624723168043e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10677 }, { "completion_length": 580.0, "epoch": 2.9595343680709534, "grad_norm": 0.0, "kl": 0.1896803230047226, "learning_rate": 1.8082417634911517e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10678 }, { "completion_length": 521.75, "epoch": 2.9598115299334813, "grad_norm": 0.0, "kl": 1.2478976249694824, "learning_rate": 1.8078210758788473e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10679 }, { "completion_length": 639.5, "epoch": 2.9600886917960088, "grad_norm": 0.0, "kl": 0.44654393196105957, "learning_rate": 1.8074004094927906e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10680 }, { "completion_length": 573.0, "epoch": 2.9603658536585367, "grad_norm": 0.0, "kl": 0.2029602825641632, "learning_rate": 1.8069797643458822e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10681 }, { "completion_length": 544.0, "epoch": 2.9606430155210646, "grad_norm": 0.0, "kl": 0.2068122774362564, "learning_rate": 1.8065591404510213e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10682 }, { "completion_length": 651.5, "epoch": 2.960920177383592, "grad_norm": 0.3340677320957184, "kl": 3637258496.0, "learning_rate": 1.8061385378211066e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10683 }, { "completion_length": 628.5, "epoch": 2.9611973392461195, "grad_norm": 0.0, "kl": 0.3672821521759033, "learning_rate": 1.805717956469038e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10684 }, { "completion_length": 573.75, "epoch": 2.9614745011086474, "grad_norm": 0.0, "kl": 0.19095145165920258, "learning_rate": 1.80529739640771e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10685 }, { "completion_length": 557.25, "epoch": 2.9617516629711753, "grad_norm": 0.0, "kl": 0.21285830438137054, "learning_rate": 1.804876857650022e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10686 }, { "completion_length": 644.25, "epoch": 2.962028824833703, "grad_norm": 0.3436112403869629, "kl": 0.16560639441013336, "learning_rate": 1.8044563402088686e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10687 }, { "completion_length": 588.25, "epoch": 2.9623059866962307, "grad_norm": 0.0, "kl": 0.18475161492824554, "learning_rate": 1.8040358440971464e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10688 }, { "completion_length": 472.0, "epoch": 2.962583148558758, "grad_norm": 0.0, "kl": 0.18856176733970642, "learning_rate": 1.8036153693277492e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10689 }, { "completion_length": 554.0, "epoch": 2.962860310421286, "grad_norm": 0.0, "kl": 0.27313774824142456, "learning_rate": 1.8031949159135717e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10690 }, { "completion_length": 616.75, "epoch": 2.9631374722838135, "grad_norm": 0.0, "kl": 0.20265375077724457, "learning_rate": 1.8027744838675078e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10691 }, { "completion_length": 658.5, "epoch": 2.9634146341463414, "grad_norm": 0.0, "kl": 0.3083241581916809, "learning_rate": 1.8023540732024494e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10692 }, { "completion_length": 585.0, "epoch": 2.9636917960088693, "grad_norm": 0.0, "kl": 0.16794060170650482, "learning_rate": 1.8019336839312909e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10693 }, { "completion_length": 624.25, "epoch": 2.963968957871397, "grad_norm": 0.0, "kl": 0.19581764936447144, "learning_rate": 1.8015133160669207e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10694 }, { "completion_length": 638.75, "epoch": 2.9642461197339247, "grad_norm": 0.0, "kl": 0.15409845113754272, "learning_rate": 1.8010929696222334e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10695 }, { "completion_length": 572.0, "epoch": 2.964523281596452, "grad_norm": 0.0, "kl": 0.21480333805084229, "learning_rate": 1.8006726446101153e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10696 }, { "completion_length": 566.25, "epoch": 2.96480044345898, "grad_norm": 0.5388047099113464, "kl": 313649247813632.0, "learning_rate": 1.800252341043459e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10697 }, { "completion_length": 631.5, "epoch": 2.9650776053215075, "grad_norm": 0.0, "kl": 0.1573335826396942, "learning_rate": 1.799832058935153e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10698 }, { "completion_length": 691.5, "epoch": 2.9653547671840355, "grad_norm": 0.420040100812912, "kl": 1379067693629440.0, "learning_rate": 1.7994117982980843e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10699 }, { "completion_length": 588.5, "epoch": 2.9656319290465634, "grad_norm": 0.4213002920150757, "kl": 0.31027770042419434, "learning_rate": 1.798991559145142e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10700 }, { "completion_length": 589.75, "epoch": 2.965909090909091, "grad_norm": 0.35687652230262756, "kl": 10594140160.0, "learning_rate": 1.798571341489212e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10701 }, { "completion_length": 511.5, "epoch": 2.9661862527716187, "grad_norm": 0.0, "kl": 0.21278949081897736, "learning_rate": 1.7981511453431817e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10702 }, { "completion_length": 543.5, "epoch": 2.966463414634146, "grad_norm": 2.6965720653533936, "kl": 48438435840.0, "learning_rate": 1.7977309707199355e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10703 }, { "completion_length": 595.0, "epoch": 2.966740576496674, "grad_norm": 0.0, "kl": 2.0411267280578613, "learning_rate": 1.7973108176323587e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10704 }, { "completion_length": 576.25, "epoch": 2.9670177383592016, "grad_norm": 0.0, "kl": 0.2714299261569977, "learning_rate": 1.7968906860933369e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10705 }, { "completion_length": 691.75, "epoch": 2.9672949002217295, "grad_norm": 0.0, "kl": 0.15565337240695953, "learning_rate": 1.7964705761157524e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10706 }, { "completion_length": 623.5, "epoch": 2.9675720620842574, "grad_norm": 0.0, "kl": 0.18812353909015656, "learning_rate": 1.796050487712489e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10707 }, { "completion_length": 732.5, "epoch": 2.967849223946785, "grad_norm": 1.864336609840393, "kl": 13544895545344.0, "learning_rate": 1.7956304208964288e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10708 }, { "completion_length": 489.5, "epoch": 2.9681263858093128, "grad_norm": 0.0, "kl": 0.1770230382680893, "learning_rate": 1.7952103756804538e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10709 }, { "completion_length": 606.0, "epoch": 2.9684035476718402, "grad_norm": 0.3398863971233368, "kl": 0.2027563899755478, "learning_rate": 1.794790352077444e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10710 }, { "completion_length": 639.0, "epoch": 2.968680709534368, "grad_norm": 0.0, "kl": 781.0299072265625, "learning_rate": 1.7943703501002812e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10711 }, { "completion_length": 572.5, "epoch": 2.9689578713968956, "grad_norm": 0.0, "kl": 0.19114193320274353, "learning_rate": 1.7939503697618438e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10712 }, { "completion_length": 612.25, "epoch": 2.9692350332594235, "grad_norm": 0.0, "kl": 0.1861056536436081, "learning_rate": 1.793530411075011e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10713 }, { "completion_length": 532.75, "epoch": 2.9695121951219514, "grad_norm": 0.0, "kl": 0.2125341296195984, "learning_rate": 1.7931104740526633e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10714 }, { "completion_length": 620.0, "epoch": 2.969789356984479, "grad_norm": 0.0, "kl": 0.19387727975845337, "learning_rate": 1.792690558707675e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10715 }, { "completion_length": 542.25, "epoch": 2.970066518847007, "grad_norm": 0.0, "kl": 0.1802661418914795, "learning_rate": 1.792270665052926e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10716 }, { "completion_length": 643.75, "epoch": 2.9703436807095343, "grad_norm": 0.0, "kl": 0.15723662078380585, "learning_rate": 1.7918507931012912e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10717 }, { "completion_length": 665.25, "epoch": 2.970620842572062, "grad_norm": 9.673745155334473, "kl": 1031955.6875, "learning_rate": 1.7914309428656473e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10718 }, { "completion_length": 608.75, "epoch": 2.9708980044345896, "grad_norm": 0.0, "kl": 0.18039368093013763, "learning_rate": 1.7910111143588681e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10719 }, { "completion_length": 650.75, "epoch": 2.9711751662971175, "grad_norm": 0.41200634837150574, "kl": 1021592320.0, "learning_rate": 1.7905913075938291e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10720 }, { "completion_length": 569.25, "epoch": 2.9714523281596454, "grad_norm": 0.6601055264472961, "kl": 0.20743617415428162, "learning_rate": 1.7901715225834037e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10721 }, { "completion_length": 621.5, "epoch": 2.971729490022173, "grad_norm": 1.8649603128433228, "kl": 70264496.0, "learning_rate": 1.7897517593404645e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10722 }, { "completion_length": 581.5, "epoch": 2.972006651884701, "grad_norm": 0.0, "kl": 0.16483661532402039, "learning_rate": 1.789332017877885e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10723 }, { "completion_length": 680.5, "epoch": 2.9722838137472283, "grad_norm": 2.157864570617676, "kl": 26123892.0, "learning_rate": 1.788912298208535e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10724 }, { "completion_length": 597.75, "epoch": 2.972560975609756, "grad_norm": 0.0, "kl": 0.18194256722927094, "learning_rate": 1.7884926003452885e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10725 }, { "completion_length": 645.25, "epoch": 2.9728381374722836, "grad_norm": 0.0, "kl": 0.19448408484458923, "learning_rate": 1.7880729243010125e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10726 }, { "completion_length": 559.0, "epoch": 2.9731152993348116, "grad_norm": 0.0, "kl": 0.16859322786331177, "learning_rate": 1.7876532700885788e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10727 }, { "completion_length": 648.5, "epoch": 2.9733924611973395, "grad_norm": 0.0, "kl": 0.19805948436260223, "learning_rate": 1.7872336377208565e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10728 }, { "completion_length": 634.25, "epoch": 2.973669623059867, "grad_norm": 0.0, "kl": 0.3287193179130554, "learning_rate": 1.7868140272107132e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10729 }, { "completion_length": 596.25, "epoch": 2.973946784922395, "grad_norm": 0.0, "kl": 0.18053483963012695, "learning_rate": 1.786394438571017e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10730 }, { "completion_length": 579.5, "epoch": 2.9742239467849223, "grad_norm": 0.0, "kl": 7837934682112.0, "learning_rate": 1.7859748718146347e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10731 }, { "completion_length": 635.25, "epoch": 2.97450110864745, "grad_norm": 0.42882874608039856, "kl": 1459763072.0, "learning_rate": 1.7855553269544332e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10732 }, { "completion_length": 595.0, "epoch": 2.9747782705099777, "grad_norm": 0.0, "kl": 0.17950338125228882, "learning_rate": 1.7851358040032773e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10733 }, { "completion_length": 571.5, "epoch": 2.9750554323725056, "grad_norm": 0.0, "kl": 5557501923688448.0, "learning_rate": 1.7847163029740323e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10734 }, { "completion_length": 625.5, "epoch": 2.9753325942350335, "grad_norm": 0.0, "kl": 0.22671526670455933, "learning_rate": 1.784296823879564e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10735 }, { "completion_length": 672.25, "epoch": 2.975609756097561, "grad_norm": 0.3865506947040558, "kl": 20383610880.0, "learning_rate": 1.783877366732733e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10736 }, { "completion_length": 593.5, "epoch": 2.9758869179600884, "grad_norm": 0.0, "kl": 590419394560.0, "learning_rate": 1.7834579315464056e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10737 }, { "completion_length": 652.25, "epoch": 2.9761640798226163, "grad_norm": 0.0, "kl": 0.16734574735164642, "learning_rate": 1.7830385183334422e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10738 }, { "completion_length": 641.0, "epoch": 2.9764412416851442, "grad_norm": 0.0, "kl": 0.19760654866695404, "learning_rate": 1.7826191271067055e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10739 }, { "completion_length": 549.25, "epoch": 2.9767184035476717, "grad_norm": 0.0, "kl": 0.2931853234767914, "learning_rate": 1.7821997578790554e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10740 }, { "completion_length": 630.0, "epoch": 2.9769955654101996, "grad_norm": 0.3789203464984894, "kl": 1208563840.0, "learning_rate": 1.7817804106633531e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10741 }, { "completion_length": 614.0, "epoch": 2.9772727272727275, "grad_norm": 0.0, "kl": 165.48690795898438, "learning_rate": 1.7813610854724578e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10742 }, { "completion_length": 658.0, "epoch": 2.977549889135255, "grad_norm": 0.0, "kl": 0.9130613803863525, "learning_rate": 1.7809417823192276e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10743 }, { "completion_length": 582.5, "epoch": 2.9778270509977824, "grad_norm": 0.0, "kl": 0.17042216658592224, "learning_rate": 1.7805225012165234e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10744 }, { "completion_length": 568.75, "epoch": 2.9781042128603104, "grad_norm": 0.0, "kl": 0.1850493997335434, "learning_rate": 1.7801032421771997e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10745 }, { "completion_length": 613.5, "epoch": 2.9783813747228383, "grad_norm": 4.22800350189209, "kl": 9398.3818359375, "learning_rate": 1.7796840052141161e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10746 }, { "completion_length": 652.25, "epoch": 2.9786585365853657, "grad_norm": 0.0, "kl": 0.14861370623111725, "learning_rate": 1.779264790340126e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10747 }, { "completion_length": 601.5, "epoch": 2.9789356984478936, "grad_norm": 0.0, "kl": 0.7575594782829285, "learning_rate": 1.778845597568088e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10748 }, { "completion_length": 599.25, "epoch": 2.9792128603104215, "grad_norm": 0.0, "kl": 0.2002057135105133, "learning_rate": 1.7784264269108551e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10749 }, { "completion_length": 648.25, "epoch": 2.979490022172949, "grad_norm": 0.0, "kl": 0.16313429176807404, "learning_rate": 1.7780072783812818e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10750 }, { "completion_length": 648.25, "epoch": 2.9797671840354765, "grad_norm": 0.0, "kl": 331664.90625, "learning_rate": 1.7775881519922225e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10751 }, { "completion_length": 570.25, "epoch": 2.9800443458980044, "grad_norm": 0.0, "kl": 0.24832890927791595, "learning_rate": 1.777169047756529e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10752 }, { "completion_length": 632.5, "epoch": 2.9803215077605323, "grad_norm": 0.0, "kl": 0.1689043641090393, "learning_rate": 1.776749965687054e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10753 }, { "completion_length": 736.0, "epoch": 2.9805986696230597, "grad_norm": 0.3470369279384613, "kl": 0.1655803620815277, "learning_rate": 1.7763309057966487e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10754 }, { "completion_length": 586.5, "epoch": 2.9808758314855877, "grad_norm": 0.0, "kl": 0.18776002526283264, "learning_rate": 1.7759118680981646e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10755 }, { "completion_length": 664.75, "epoch": 2.9811529933481156, "grad_norm": 0.0, "kl": 0.203237384557724, "learning_rate": 1.7754928526044506e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10756 }, { "completion_length": 630.75, "epoch": 2.981430155210643, "grad_norm": 0.0, "kl": 0.22643446922302246, "learning_rate": 1.7750738593283573e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10757 }, { "completion_length": 592.5, "epoch": 2.9817073170731705, "grad_norm": 0.0, "kl": 0.19654089212417603, "learning_rate": 1.7746548882827336e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10758 }, { "completion_length": 633.25, "epoch": 2.9819844789356984, "grad_norm": 0.0, "kl": 0.18190661072731018, "learning_rate": 1.7742359394804264e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10759 }, { "completion_length": 662.0, "epoch": 2.9822616407982263, "grad_norm": 0.0, "kl": 0.16037598252296448, "learning_rate": 1.7738170129342847e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10760 }, { "completion_length": 711.25, "epoch": 2.9825388026607538, "grad_norm": 0.0, "kl": 0.17172272503376007, "learning_rate": 1.7733981086571533e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10761 }, { "completion_length": 716.0, "epoch": 2.9828159645232817, "grad_norm": 0.5790085196495056, "kl": 3607627563008.0, "learning_rate": 1.7729792266618803e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10762 }, { "completion_length": 823.5, "epoch": 2.983093126385809, "grad_norm": 0.0, "kl": 0.13897362351417542, "learning_rate": 1.7725603669613095e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10763 }, { "completion_length": 602.75, "epoch": 2.983370288248337, "grad_norm": 0.0, "kl": 0.17448711395263672, "learning_rate": 1.7721415295682853e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10764 }, { "completion_length": 518.5, "epoch": 2.9836474501108645, "grad_norm": 0.0, "kl": 0.15904192626476288, "learning_rate": 1.7717227144956548e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10765 }, { "completion_length": 615.0, "epoch": 2.9839246119733924, "grad_norm": 0.0, "kl": 0.2070714682340622, "learning_rate": 1.7713039217562573e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10766 }, { "completion_length": 631.0, "epoch": 2.9842017738359203, "grad_norm": 0.0, "kl": 0.18568724393844604, "learning_rate": 1.7708851513629376e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10767 }, { "completion_length": 583.5, "epoch": 2.984478935698448, "grad_norm": 0.0, "kl": 0.1962045431137085, "learning_rate": 1.7704664033285374e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10768 }, { "completion_length": 576.0, "epoch": 2.9847560975609757, "grad_norm": 0.0, "kl": 0.18263545632362366, "learning_rate": 1.7700476776658982e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10769 }, { "completion_length": 725.5, "epoch": 2.985033259423503, "grad_norm": 0.0, "kl": 0.17185766994953156, "learning_rate": 1.7696289743878598e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10770 }, { "completion_length": 565.5, "epoch": 2.985310421286031, "grad_norm": 0.0, "kl": 0.21182754635810852, "learning_rate": 1.769210293507263e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10771 }, { "completion_length": 596.0, "epoch": 2.9855875831485585, "grad_norm": 1.7836679220199585, "kl": 2374800128.0, "learning_rate": 1.7687916350369458e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10772 }, { "completion_length": 649.75, "epoch": 2.9858647450110865, "grad_norm": 0.0, "kl": 0.19369906187057495, "learning_rate": 1.7683729989897474e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10773 }, { "completion_length": 675.25, "epoch": 2.9861419068736144, "grad_norm": 0.0, "kl": 0.5374467968940735, "learning_rate": 1.7679543853785065e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10774 }, { "completion_length": 483.0, "epoch": 2.986419068736142, "grad_norm": 0.0, "kl": 0.2021133005619049, "learning_rate": 1.7675357942160578e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10775 }, { "completion_length": 637.0, "epoch": 2.9866962305986697, "grad_norm": 0.0, "kl": 0.16671252250671387, "learning_rate": 1.7671172255152414e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10776 }, { "completion_length": 657.25, "epoch": 2.986973392461197, "grad_norm": 0.0, "kl": 0.16462968289852142, "learning_rate": 1.766698679288889e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10777 }, { "completion_length": 666.25, "epoch": 2.987250554323725, "grad_norm": 0.0, "kl": 0.16993378102779388, "learning_rate": 1.7662801555498394e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10778 }, { "completion_length": 512.75, "epoch": 2.9875277161862526, "grad_norm": 3.230041027069092, "kl": 677514240.0, "learning_rate": 1.7658616543109237e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10779 }, { "completion_length": 618.25, "epoch": 2.9878048780487805, "grad_norm": 0.0, "kl": 0.18597553670406342, "learning_rate": 1.7654431755849771e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10780 }, { "completion_length": 630.0, "epoch": 2.9880820399113084, "grad_norm": 0.0, "kl": 0.23333442211151123, "learning_rate": 1.7650247193848338e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10781 }, { "completion_length": 557.75, "epoch": 2.988359201773836, "grad_norm": 1.9900851249694824, "kl": 65626660864.0, "learning_rate": 1.7646062857233243e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10782 }, { "completion_length": 680.0, "epoch": 2.9886363636363638, "grad_norm": 0.0, "kl": 0.1521991342306137, "learning_rate": 1.7641878746132813e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10783 }, { "completion_length": 506.0, "epoch": 2.988913525498891, "grad_norm": 0.0, "kl": 0.3313372731208801, "learning_rate": 1.7637694860675348e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10784 }, { "completion_length": 515.0, "epoch": 2.989190687361419, "grad_norm": 0.0, "kl": 0.2059704065322876, "learning_rate": 1.763351120098916e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10785 }, { "completion_length": 543.0, "epoch": 2.9894678492239466, "grad_norm": 2.4302420616149902, "kl": 307135862800384.0, "learning_rate": 1.7629327767202528e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10786 }, { "completion_length": 588.75, "epoch": 2.9897450110864745, "grad_norm": 0.0, "kl": 0.2269393503665924, "learning_rate": 1.7625144559443758e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10787 }, { "completion_length": 574.5, "epoch": 2.9900221729490024, "grad_norm": 0.0, "kl": 0.2658025920391083, "learning_rate": 1.7620961577841134e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10788 }, { "completion_length": 609.0, "epoch": 2.99029933481153, "grad_norm": 0.0, "kl": 0.24078233540058136, "learning_rate": 1.7616778822522911e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10789 }, { "completion_length": 727.5, "epoch": 2.990576496674058, "grad_norm": 0.29873111844062805, "kl": 0.17262984812259674, "learning_rate": 1.7612596293617379e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10790 }, { "completion_length": 773.0, "epoch": 2.9908536585365852, "grad_norm": 0.27185991406440735, "kl": 0.15639296174049377, "learning_rate": 1.7608413991252782e-06, "loss": -0.0, "reward": 5.59375, "reward_std": 0.3125, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 10791 }, { "completion_length": 542.25, "epoch": 2.991130820399113, "grad_norm": 0.0, "kl": 0.20933161675930023, "learning_rate": 1.7604231915557382e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10792 }, { "completion_length": 633.0, "epoch": 2.9914079822616406, "grad_norm": 0.0, "kl": 2.8742258548736572, "learning_rate": 1.7600050066659418e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10793 }, { "completion_length": 590.5, "epoch": 2.9916851441241685, "grad_norm": 0.0, "kl": 0.21429245173931122, "learning_rate": 1.7595868444687133e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10794 }, { "completion_length": 588.75, "epoch": 2.9919623059866964, "grad_norm": 0.0, "kl": 0.18973618745803833, "learning_rate": 1.7591687049768776e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10795 }, { "completion_length": 585.0, "epoch": 2.992239467849224, "grad_norm": 0.0, "kl": 0.21488362550735474, "learning_rate": 1.7587505882032546e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10796 }, { "completion_length": 650.5, "epoch": 2.992516629711752, "grad_norm": 3.1382205486297607, "kl": 6931677184000.0, "learning_rate": 1.758332494160669e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10797 }, { "completion_length": 555.75, "epoch": 2.9927937915742793, "grad_norm": 0.40986621379852295, "kl": 0.17623935639858246, "learning_rate": 1.7579144228619388e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10798 }, { "completion_length": 654.5, "epoch": 2.993070953436807, "grad_norm": 0.0, "kl": 0.1593872457742691, "learning_rate": 1.7574963743198876e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10799 }, { "completion_length": 513.75, "epoch": 2.9933481152993346, "grad_norm": 0.0, "kl": 0.21787631511688232, "learning_rate": 1.757078348547333e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10800 }, { "completion_length": 523.75, "epoch": 2.9936252771618626, "grad_norm": 0.0, "kl": 0.24951902031898499, "learning_rate": 1.7566603455570951e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10801 }, { "completion_length": 568.0, "epoch": 2.9939024390243905, "grad_norm": 0.0, "kl": 0.17067836225032806, "learning_rate": 1.7562423653619931e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10802 }, { "completion_length": 585.5, "epoch": 2.994179600886918, "grad_norm": 0.0, "kl": 0.17918455600738525, "learning_rate": 1.7558244079748427e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10803 }, { "completion_length": 550.75, "epoch": 2.994456762749446, "grad_norm": 0.0, "kl": 0.19927151501178741, "learning_rate": 1.755406473408463e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10804 }, { "completion_length": 660.0, "epoch": 2.9947339246119733, "grad_norm": 0.0, "kl": 0.1833309680223465, "learning_rate": 1.7549885616756685e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10805 }, { "completion_length": 541.75, "epoch": 2.995011086474501, "grad_norm": 0.0, "kl": 0.21454742550849915, "learning_rate": 1.7545706727892763e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10806 }, { "completion_length": 660.5, "epoch": 2.9952882483370287, "grad_norm": 0.0, "kl": 0.186985045671463, "learning_rate": 1.7541528067620995e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10807 }, { "completion_length": 598.25, "epoch": 2.9955654101995566, "grad_norm": 0.0, "kl": 0.16871757805347443, "learning_rate": 1.7537349636069551e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10808 }, { "completion_length": 515.0, "epoch": 2.9958425720620845, "grad_norm": 0.0, "kl": 0.23638732731342316, "learning_rate": 1.7533171433366533e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10809 }, { "completion_length": 571.75, "epoch": 2.996119733924612, "grad_norm": 0.0, "kl": 0.17181870341300964, "learning_rate": 1.7528993459640093e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10810 }, { "completion_length": 565.5, "epoch": 2.9963968957871394, "grad_norm": 0.0, "kl": 0.2055709809064865, "learning_rate": 1.7524815715018352e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10811 }, { "completion_length": 636.0, "epoch": 2.9966740576496673, "grad_norm": 1.1697115898132324, "kl": 26258089967616.0, "learning_rate": 1.7520638199629408e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10812 }, { "completion_length": 614.5, "epoch": 2.9969512195121952, "grad_norm": 0.0, "kl": 0.18309977650642395, "learning_rate": 1.7516460913601382e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10813 }, { "completion_length": 709.25, "epoch": 2.9972283813747227, "grad_norm": 0.0, "kl": 0.19603271782398224, "learning_rate": 1.7512283857062364e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10814 }, { "completion_length": 524.25, "epoch": 2.9975055432372506, "grad_norm": 0.0, "kl": 0.2822953462600708, "learning_rate": 1.750810703014046e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10815 }, { "completion_length": 491.75, "epoch": 2.9977827050997785, "grad_norm": 0.0, "kl": 0.2161291539669037, "learning_rate": 1.750393043296374e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10816 }, { "completion_length": 729.0, "epoch": 2.998059866962306, "grad_norm": 0.0, "kl": 0.18625827133655548, "learning_rate": 1.7499754065660288e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10817 }, { "completion_length": 609.5, "epoch": 2.9983370288248334, "grad_norm": 0.0, "kl": 0.1866089552640915, "learning_rate": 1.7495577928358188e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10818 }, { "completion_length": 582.0, "epoch": 2.9986141906873613, "grad_norm": 0.0, "kl": 6.81777811050415, "learning_rate": 1.7491402021185489e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10819 }, { "completion_length": 561.25, "epoch": 2.9988913525498893, "grad_norm": 2.7018258571624756, "kl": 12673250304.0, "learning_rate": 1.7487226344270262e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10820 }, { "completion_length": 563.5, "epoch": 2.9991685144124167, "grad_norm": 0.0, "kl": 0.2396044284105301, "learning_rate": 1.7483050897740545e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10821 }, { "completion_length": 539.25, "epoch": 2.9994456762749446, "grad_norm": 0.0, "kl": 17528975785984.0, "learning_rate": 1.7478875681724392e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10822 }, { "completion_length": 658.5, "epoch": 2.9997228381374725, "grad_norm": 0.0, "kl": 0.16758498549461365, "learning_rate": 1.7474700696349829e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10823 }, { "completion_length": 533.5, "epoch": 3.0, "grad_norm": 0.0, "kl": 0.24981778860092163, "learning_rate": 1.7470525941744892e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10824 }, { "completion_length": 644.0, "epoch": 3.000277161862528, "grad_norm": 0.0, "kl": 0.1733572781085968, "learning_rate": 1.7466351418037608e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10825 }, { "completion_length": 574.75, "epoch": 3.0005543237250554, "grad_norm": 0.0, "kl": 0.1865147203207016, "learning_rate": 1.7462177125355973e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10826 }, { "completion_length": 531.0, "epoch": 3.0008314855875833, "grad_norm": 0.0, "kl": 0.16815876960754395, "learning_rate": 1.7458003063828027e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10827 }, { "completion_length": 532.0, "epoch": 3.0011086474501107, "grad_norm": 0.0, "kl": 0.294018030166626, "learning_rate": 1.745382923358174e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10828 }, { "completion_length": 570.75, "epoch": 3.0013858093126387, "grad_norm": 0.0, "kl": 2250281005875200.0, "learning_rate": 1.7449655634745133e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10829 }, { "completion_length": 699.75, "epoch": 3.001662971175166, "grad_norm": 4.722599983215332, "kl": 1038601.4375, "learning_rate": 1.7445482267446163e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10830 }, { "completion_length": 565.0, "epoch": 3.001940133037694, "grad_norm": 0.0, "kl": 0.17354144155979156, "learning_rate": 1.7441309131812828e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10831 }, { "completion_length": 603.0, "epoch": 3.002217294900222, "grad_norm": 0.6105034947395325, "kl": 139147813584896.0, "learning_rate": 1.7437136227973108e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10832 }, { "completion_length": 596.25, "epoch": 3.0024944567627494, "grad_norm": 0.36937415599823, "kl": 1109566213324800.0, "learning_rate": 1.7432963556054955e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10833 }, { "completion_length": 622.75, "epoch": 3.0027716186252773, "grad_norm": 0.0, "kl": 0.18940891325473785, "learning_rate": 1.7428791116186333e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10834 }, { "completion_length": 627.25, "epoch": 3.0030487804878048, "grad_norm": 0.0, "kl": 1.680635690689087, "learning_rate": 1.7424618908495189e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10835 }, { "completion_length": 673.0, "epoch": 3.0033259423503327, "grad_norm": 0.0, "kl": 0.1934100240468979, "learning_rate": 1.7420446933109475e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10836 }, { "completion_length": 584.0, "epoch": 3.00360310421286, "grad_norm": 0.6928917765617371, "kl": 2301624655544320.0, "learning_rate": 1.7416275190157111e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10837 }, { "completion_length": 587.25, "epoch": 3.003880266075388, "grad_norm": 0.0, "kl": 0.16610825061798096, "learning_rate": 1.7412103679766057e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10838 }, { "completion_length": 533.75, "epoch": 3.004157427937916, "grad_norm": 0.0, "kl": 0.23905131220817566, "learning_rate": 1.7407932402064204e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10839 }, { "completion_length": 545.0, "epoch": 3.0044345898004434, "grad_norm": 0.0, "kl": 0.1847614198923111, "learning_rate": 1.7403761357179485e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10840 }, { "completion_length": 588.25, "epoch": 3.0047117516629713, "grad_norm": 0.0, "kl": 0.19880783557891846, "learning_rate": 1.7399590545239813e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10841 }, { "completion_length": 637.0, "epoch": 3.004988913525499, "grad_norm": 0.0, "kl": 0.557056188583374, "learning_rate": 1.7395419966373075e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10842 }, { "completion_length": 627.5, "epoch": 3.0052660753880267, "grad_norm": 0.4440484344959259, "kl": 4899221949906944.0, "learning_rate": 1.7391249620707179e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10843 }, { "completion_length": 557.25, "epoch": 3.005543237250554, "grad_norm": 0.0, "kl": 0.3680342137813568, "learning_rate": 1.7387079508369996e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10844 }, { "completion_length": 552.0, "epoch": 3.005820399113082, "grad_norm": 0.0, "kl": 7218433.5, "learning_rate": 1.7382909629489424e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10845 }, { "completion_length": 592.5, "epoch": 3.0060975609756095, "grad_norm": 0.0, "kl": 3226921135505408.0, "learning_rate": 1.7378739984193326e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10846 }, { "completion_length": 583.5, "epoch": 3.0063747228381374, "grad_norm": 0.0, "kl": 0.15680457651615143, "learning_rate": 1.7374570572609559e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10847 }, { "completion_length": 496.75, "epoch": 3.0066518847006654, "grad_norm": 0.0, "kl": 0.2151905745267868, "learning_rate": 1.737040139486601e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10848 }, { "completion_length": 616.5, "epoch": 3.006929046563193, "grad_norm": 0.0, "kl": 1.918781042098999, "learning_rate": 1.7366232451090493e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10849 }, { "completion_length": 548.5, "epoch": 3.0072062084257207, "grad_norm": 0.0, "kl": 1.3071327209472656, "learning_rate": 1.7362063741410884e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10850 }, { "completion_length": 606.25, "epoch": 3.007483370288248, "grad_norm": 0.0, "kl": 1.438941240310669, "learning_rate": 1.7357895265954998e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10851 }, { "completion_length": 596.0, "epoch": 3.007760532150776, "grad_norm": 2.68272066116333, "kl": 0.18185773491859436, "learning_rate": 1.7353727024850683e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10852 }, { "completion_length": 586.25, "epoch": 3.0080376940133036, "grad_norm": 0.0, "kl": 0.16697633266448975, "learning_rate": 1.7349559018225748e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10853 }, { "completion_length": 535.5, "epoch": 3.0083148558758315, "grad_norm": 0.0, "kl": 0.7384041547775269, "learning_rate": 1.734539124620801e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10854 }, { "completion_length": 621.75, "epoch": 3.0085920177383594, "grad_norm": 0.0, "kl": 0.19127461314201355, "learning_rate": 1.7341223708925287e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10855 }, { "completion_length": 680.5, "epoch": 3.008869179600887, "grad_norm": 0.0, "kl": 3664075008.0, "learning_rate": 1.7337056406505359e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10856 }, { "completion_length": 561.0, "epoch": 3.0091463414634148, "grad_norm": 0.0, "kl": 0.19340473413467407, "learning_rate": 1.7332889339076053e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10857 }, { "completion_length": 543.0, "epoch": 3.009423503325942, "grad_norm": 0.0, "kl": 0.1882767677307129, "learning_rate": 1.7328722506765122e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10858 }, { "completion_length": 685.5, "epoch": 3.00970066518847, "grad_norm": 0.0, "kl": 12.40407657623291, "learning_rate": 1.7324555909700374e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10859 }, { "completion_length": 693.25, "epoch": 3.0099778270509976, "grad_norm": 0.0, "kl": 0.16899165511131287, "learning_rate": 1.732038954800955e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10860 }, { "completion_length": 744.5, "epoch": 3.0102549889135255, "grad_norm": 0.3142794966697693, "kl": 1.5564149618148804, "learning_rate": 1.731622342182044e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10861 }, { "completion_length": 608.25, "epoch": 3.0105321507760534, "grad_norm": 0.0, "kl": 0.18023858964443207, "learning_rate": 1.7312057531260798e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10862 }, { "completion_length": 561.0, "epoch": 3.010809312638581, "grad_norm": 0.0, "kl": 1.1462522745132446, "learning_rate": 1.7307891876458366e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10863 }, { "completion_length": 673.25, "epoch": 3.011086474501109, "grad_norm": 0.0, "kl": 0.18347586691379547, "learning_rate": 1.7303726457540895e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10864 }, { "completion_length": 493.25, "epoch": 3.0113636363636362, "grad_norm": 0.0, "kl": 13.703927040100098, "learning_rate": 1.7299561274636116e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10865 }, { "completion_length": 646.75, "epoch": 3.011640798226164, "grad_norm": 0.0, "kl": 0.1875467747449875, "learning_rate": 1.729539632787176e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10866 }, { "completion_length": 595.5, "epoch": 3.0119179600886916, "grad_norm": 0.0, "kl": 0.3436359465122223, "learning_rate": 1.7291231617375547e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10867 }, { "completion_length": 594.0, "epoch": 3.0121951219512195, "grad_norm": 0.0, "kl": 0.197566419839859, "learning_rate": 1.72870671432752e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10868 }, { "completion_length": 588.5, "epoch": 3.0124722838137474, "grad_norm": 0.0, "kl": 0.17195814847946167, "learning_rate": 1.7282902905698401e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10869 }, { "completion_length": 502.25, "epoch": 3.012749445676275, "grad_norm": 0.0, "kl": 0.1896771937608719, "learning_rate": 1.727873890477288e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10870 }, { "completion_length": 617.5, "epoch": 3.013026607538803, "grad_norm": 0.0, "kl": 0.17815828323364258, "learning_rate": 1.7274575140626318e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10871 }, { "completion_length": 495.0, "epoch": 3.0133037694013303, "grad_norm": 0.0, "kl": 0.34882763028144836, "learning_rate": 1.7270411613386396e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10872 }, { "completion_length": 644.5, "epoch": 3.013580931263858, "grad_norm": 0.0, "kl": 0.18474888801574707, "learning_rate": 1.7266248323180801e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10873 }, { "completion_length": 518.5, "epoch": 3.0138580931263856, "grad_norm": 5.070231914520264, "kl": 770918383616.0, "learning_rate": 1.7262085270137192e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10874 }, { "completion_length": 661.25, "epoch": 3.0141352549889135, "grad_norm": 0.0, "kl": 0.19518084824085236, "learning_rate": 1.725792245438324e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10875 }, { "completion_length": 552.75, "epoch": 3.0144124168514415, "grad_norm": 0.0, "kl": 0.17645852267742157, "learning_rate": 1.7253759876046602e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10876 }, { "completion_length": 576.0, "epoch": 3.014689578713969, "grad_norm": 0.5069136023521423, "kl": 0.19022023677825928, "learning_rate": 1.7249597535254916e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10877 }, { "completion_length": 526.5, "epoch": 3.014966740576497, "grad_norm": 0.0, "kl": 0.16124340891838074, "learning_rate": 1.7245435432135851e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10878 }, { "completion_length": 639.0, "epoch": 3.0152439024390243, "grad_norm": 0.4388270080089569, "kl": 10640882688.0, "learning_rate": 1.7241273566817007e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10879 }, { "completion_length": 650.5, "epoch": 3.015521064301552, "grad_norm": 0.0, "kl": 0.2161949872970581, "learning_rate": 1.7237111939426034e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10880 }, { "completion_length": 604.75, "epoch": 3.0157982261640797, "grad_norm": 0.0, "kl": 0.24225173890590668, "learning_rate": 1.7232950550090544e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10881 }, { "completion_length": 607.25, "epoch": 3.0160753880266076, "grad_norm": 0.0, "kl": 0.18124504387378693, "learning_rate": 1.7228789398938154e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10882 }, { "completion_length": 536.75, "epoch": 3.016352549889135, "grad_norm": 0.0, "kl": 93796614144.0, "learning_rate": 1.7224628486096462e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10883 }, { "completion_length": 586.5, "epoch": 3.016629711751663, "grad_norm": 0.0, "kl": 0.16097025573253632, "learning_rate": 1.7220467811693064e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10884 }, { "completion_length": 690.25, "epoch": 3.016906873614191, "grad_norm": 0.0, "kl": 0.18000352382659912, "learning_rate": 1.7216307375855568e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10885 }, { "completion_length": 626.5, "epoch": 3.0171840354767183, "grad_norm": 0.0, "kl": 0.16376566886901855, "learning_rate": 1.7212147178711535e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10886 }, { "completion_length": 593.5, "epoch": 3.0174611973392462, "grad_norm": 0.0, "kl": 0.19094330072402954, "learning_rate": 1.720798722038856e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10887 }, { "completion_length": 640.5, "epoch": 3.0177383592017737, "grad_norm": 0.3452216386795044, "kl": 0.7352176308631897, "learning_rate": 1.7203827501014193e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10888 }, { "completion_length": 587.0, "epoch": 3.0180155210643016, "grad_norm": 0.0, "kl": 0.16923539340496063, "learning_rate": 1.7199668020716021e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10889 }, { "completion_length": 707.5, "epoch": 3.018292682926829, "grad_norm": 0.0, "kl": 0.1342620998620987, "learning_rate": 1.7195508779621564e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10890 }, { "completion_length": 689.75, "epoch": 3.018569844789357, "grad_norm": 0.0, "kl": 0.15256717801094055, "learning_rate": 1.7191349777858396e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10891 }, { "completion_length": 694.75, "epoch": 3.018847006651885, "grad_norm": 0.3181244730949402, "kl": 0.16525857150554657, "learning_rate": 1.7187191015554047e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10892 }, { "completion_length": 697.25, "epoch": 3.0191241685144123, "grad_norm": 0.0, "kl": 0.1868075132369995, "learning_rate": 1.718303249283605e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10893 }, { "completion_length": 555.5, "epoch": 3.0194013303769403, "grad_norm": 0.0, "kl": 0.2433050125837326, "learning_rate": 1.7178874209831929e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10894 }, { "completion_length": 625.25, "epoch": 3.0196784922394677, "grad_norm": 0.0, "kl": 0.17167016863822937, "learning_rate": 1.7174716166669197e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10895 }, { "completion_length": 623.5, "epoch": 3.0199556541019956, "grad_norm": 0.0, "kl": 0.19723685085773468, "learning_rate": 1.7170558363475371e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10896 }, { "completion_length": 631.5, "epoch": 3.020232815964523, "grad_norm": 0.0, "kl": 0.16381999850273132, "learning_rate": 1.7166400800377947e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10897 }, { "completion_length": 606.25, "epoch": 3.020509977827051, "grad_norm": 0.0, "kl": 0.2215452343225479, "learning_rate": 1.716224347750442e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10898 }, { "completion_length": 672.0, "epoch": 3.020787139689579, "grad_norm": 2.104956865310669, "kl": 885283088236544.0, "learning_rate": 1.7158086394982295e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10899 }, { "completion_length": 623.25, "epoch": 3.0210643015521064, "grad_norm": 0.0, "kl": 0.9850960969924927, "learning_rate": 1.7153929552939025e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10900 }, { "completion_length": 699.75, "epoch": 3.0213414634146343, "grad_norm": 0.0, "kl": 1.8756240606307983, "learning_rate": 1.7149772951502105e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10901 }, { "completion_length": 550.75, "epoch": 3.0216186252771617, "grad_norm": 0.0, "kl": 0.1971965730190277, "learning_rate": 1.714561659079899e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10902 }, { "completion_length": 595.75, "epoch": 3.0218957871396896, "grad_norm": 0.0, "kl": 0.16175426542758942, "learning_rate": 1.7141460470957142e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10903 }, { "completion_length": 535.0, "epoch": 3.022172949002217, "grad_norm": 0.4216882288455963, "kl": 0.174615278840065, "learning_rate": 1.713730459210401e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10904 }, { "completion_length": 567.75, "epoch": 3.022450110864745, "grad_norm": 0.0, "kl": 0.23262840509414673, "learning_rate": 1.7133148954367038e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10905 }, { "completion_length": 648.75, "epoch": 3.022727272727273, "grad_norm": 0.0, "kl": 0.18366435170173645, "learning_rate": 1.712899355787366e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10906 }, { "completion_length": 563.25, "epoch": 3.0230044345898004, "grad_norm": 1.8446718454360962, "kl": 3215.819580078125, "learning_rate": 1.7124838402751304e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10907 }, { "completion_length": 554.0, "epoch": 3.0232815964523283, "grad_norm": 0.0, "kl": 2394421450506240.0, "learning_rate": 1.7120683489127405e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10908 }, { "completion_length": 568.0, "epoch": 3.0235587583148558, "grad_norm": 0.0, "kl": 0.17305126786231995, "learning_rate": 1.7116528817129353e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10909 }, { "completion_length": 606.25, "epoch": 3.0238359201773837, "grad_norm": 0.0, "kl": 0.218869149684906, "learning_rate": 1.7112374386884583e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10910 }, { "completion_length": 558.5, "epoch": 3.024113082039911, "grad_norm": 0.41598260402679443, "kl": 0.257365345954895, "learning_rate": 1.7108220198520465e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10911 }, { "completion_length": 634.75, "epoch": 3.024390243902439, "grad_norm": 0.0, "kl": 0.21950596570968628, "learning_rate": 1.7104066252164414e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10912 }, { "completion_length": 484.0, "epoch": 3.024667405764967, "grad_norm": 0.0, "kl": 0.21914252638816833, "learning_rate": 1.7099912547943799e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10913 }, { "completion_length": 581.75, "epoch": 3.0249445676274944, "grad_norm": 0.0, "kl": 1087648235520.0, "learning_rate": 1.7095759085986003e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10914 }, { "completion_length": 607.5, "epoch": 3.0252217294900223, "grad_norm": 0.0, "kl": 0.18404637277126312, "learning_rate": 1.70916058664184e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10915 }, { "completion_length": 654.5, "epoch": 3.02549889135255, "grad_norm": 0.0, "kl": 0.1774405986070633, "learning_rate": 1.7087452889368341e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10916 }, { "completion_length": 571.5, "epoch": 3.0257760532150777, "grad_norm": 0.0, "kl": 0.19091388583183289, "learning_rate": 1.7083300154963195e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10917 }, { "completion_length": 652.25, "epoch": 3.026053215077605, "grad_norm": 0.0, "kl": 0.16052399575710297, "learning_rate": 1.7079147663330292e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10918 }, { "completion_length": 598.25, "epoch": 3.026330376940133, "grad_norm": 0.0, "kl": 0.23107299208641052, "learning_rate": 1.7074995414596984e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10919 }, { "completion_length": 623.0, "epoch": 3.0266075388026605, "grad_norm": 0.0, "kl": 11500382060544.0, "learning_rate": 1.7070843408890592e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10920 }, { "completion_length": 712.5, "epoch": 3.0268847006651884, "grad_norm": 0.34782296419143677, "kl": 0.15010784566402435, "learning_rate": 1.7066691646338454e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10921 }, { "completion_length": 597.75, "epoch": 3.0271618625277164, "grad_norm": 0.0, "kl": 0.17153537273406982, "learning_rate": 1.7062540127067886e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10922 }, { "completion_length": 586.0, "epoch": 3.027439024390244, "grad_norm": 0.38137581944465637, "kl": 18091343872.0, "learning_rate": 1.7058388851206187e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10923 }, { "completion_length": 609.0, "epoch": 3.0277161862527717, "grad_norm": 0.0, "kl": 18.370107650756836, "learning_rate": 1.705423781888067e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10924 }, { "completion_length": 602.75, "epoch": 3.027993348115299, "grad_norm": 0.0, "kl": 0.15183310210704803, "learning_rate": 1.7050087030218622e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10925 }, { "completion_length": 704.75, "epoch": 3.028270509977827, "grad_norm": 0.0, "kl": 0.1638893485069275, "learning_rate": 1.7045936485347342e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10926 }, { "completion_length": 682.25, "epoch": 3.0285476718403546, "grad_norm": 0.0, "kl": 0.15965422987937927, "learning_rate": 1.7041786184394094e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10927 }, { "completion_length": 550.5, "epoch": 3.0288248337028825, "grad_norm": 6.5183186531066895, "kl": 685407926747136.0, "learning_rate": 1.7037636127486152e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10928 }, { "completion_length": 548.0, "epoch": 3.0291019955654104, "grad_norm": 0.0, "kl": 0.2392667680978775, "learning_rate": 1.7033486314750802e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10929 }, { "completion_length": 626.25, "epoch": 3.029379157427938, "grad_norm": 0.0, "kl": 0.4140738248825073, "learning_rate": 1.7029336746315272e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10930 }, { "completion_length": 558.5, "epoch": 3.0296563192904657, "grad_norm": 0.0, "kl": 0.22682161629199982, "learning_rate": 1.7025187422306841e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10931 }, { "completion_length": 595.0, "epoch": 3.029933481152993, "grad_norm": 0.0, "kl": 1.6084978941034496e+16, "learning_rate": 1.702103834285273e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10932 }, { "completion_length": 494.25, "epoch": 3.030210643015521, "grad_norm": 0.4654051661491394, "kl": 2.485601727152128e+16, "learning_rate": 1.7016889508080187e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10933 }, { "completion_length": 687.75, "epoch": 3.0304878048780486, "grad_norm": 0.0, "kl": 0.16578583419322968, "learning_rate": 1.7012740918116427e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10934 }, { "completion_length": 616.5, "epoch": 3.0307649667405765, "grad_norm": 0.0, "kl": 17349804.0, "learning_rate": 1.7008592573088684e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10935 }, { "completion_length": 567.75, "epoch": 3.0310421286031044, "grad_norm": 0.0, "kl": 0.18658296763896942, "learning_rate": 1.7004444473124154e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10936 }, { "completion_length": 568.5, "epoch": 3.031319290465632, "grad_norm": 0.0, "kl": 0.22208628058433533, "learning_rate": 1.7000296618350054e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10937 }, { "completion_length": 662.25, "epoch": 3.0315964523281598, "grad_norm": 0.0, "kl": 0.18100404739379883, "learning_rate": 1.6996149008893586e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10938 }, { "completion_length": 551.5, "epoch": 3.0318736141906872, "grad_norm": 0.0, "kl": 0.21875496208667755, "learning_rate": 1.699200164488192e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10939 }, { "completion_length": 593.5, "epoch": 3.032150776053215, "grad_norm": 0.0, "kl": 0.1540866494178772, "learning_rate": 1.6987854526442265e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10940 }, { "completion_length": 681.75, "epoch": 3.0324279379157426, "grad_norm": 0.0, "kl": 0.18141713738441467, "learning_rate": 1.698370765370177e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10941 }, { "completion_length": 506.25, "epoch": 3.0327050997782705, "grad_norm": 0.0, "kl": 0.21520476043224335, "learning_rate": 1.6979561026787627e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10942 }, { "completion_length": 594.25, "epoch": 3.0329822616407984, "grad_norm": 0.0, "kl": 0.1949734389781952, "learning_rate": 1.6975414645826972e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10943 }, { "completion_length": 578.0, "epoch": 3.033259423503326, "grad_norm": 0.0, "kl": 0.16383393108844757, "learning_rate": 1.697126851094697e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10944 }, { "completion_length": 637.25, "epoch": 3.033536585365854, "grad_norm": 2.334404945373535, "kl": 0.5889330506324768, "learning_rate": 1.696712262227477e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10945 }, { "completion_length": 602.25, "epoch": 3.0338137472283813, "grad_norm": 0.0, "kl": 0.1910959780216217, "learning_rate": 1.6962976979937498e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10946 }, { "completion_length": 587.75, "epoch": 3.034090909090909, "grad_norm": 0.0, "kl": 0.18412557244300842, "learning_rate": 1.6958831584062294e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10947 }, { "completion_length": 617.0, "epoch": 3.0343680709534366, "grad_norm": 0.0, "kl": 0.19218367338180542, "learning_rate": 1.6954686434776268e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10948 }, { "completion_length": 639.0, "epoch": 3.0346452328159645, "grad_norm": 0.0, "kl": 0.20849643647670746, "learning_rate": 1.695054153220655e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10949 }, { "completion_length": 618.75, "epoch": 3.0349223946784925, "grad_norm": 0.0, "kl": 0.30293112993240356, "learning_rate": 1.6946396876480225e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10950 }, { "completion_length": 695.25, "epoch": 3.03519955654102, "grad_norm": 0.0, "kl": 0.16107246279716492, "learning_rate": 1.6942252467724412e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10951 }, { "completion_length": 623.25, "epoch": 3.035476718403548, "grad_norm": 2.198895215988159, "kl": 230990495744.0, "learning_rate": 1.6938108306066203e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10952 }, { "completion_length": 644.75, "epoch": 3.0357538802660753, "grad_norm": 0.0, "kl": 0.23448818922042847, "learning_rate": 1.6933964391632667e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10953 }, { "completion_length": 546.75, "epoch": 3.036031042128603, "grad_norm": 0.0, "kl": 0.30872607231140137, "learning_rate": 1.69298207245509e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10954 }, { "completion_length": 576.25, "epoch": 3.0363082039911307, "grad_norm": 0.0, "kl": 0.22679294645786285, "learning_rate": 1.6925677304947949e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10955 }, { "completion_length": 606.5, "epoch": 3.0365853658536586, "grad_norm": 0.0, "kl": 0.20696519315242767, "learning_rate": 1.6921534132950895e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10956 }, { "completion_length": 557.0, "epoch": 3.0368625277161865, "grad_norm": 2.008849859237671, "kl": 29618550784.0, "learning_rate": 1.6917391208686779e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10957 }, { "completion_length": 586.5, "epoch": 3.037139689578714, "grad_norm": 0.0, "kl": 0.20114457607269287, "learning_rate": 1.6913248532282644e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10958 }, { "completion_length": 649.25, "epoch": 3.037416851441242, "grad_norm": 0.0, "kl": 0.1590787172317505, "learning_rate": 1.6909106103865553e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10959 }, { "completion_length": 572.75, "epoch": 3.0376940133037693, "grad_norm": 0.0, "kl": 0.17970870435237885, "learning_rate": 1.6904963923562506e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10960 }, { "completion_length": 587.75, "epoch": 3.037971175166297, "grad_norm": 0.0, "kl": 0.17972847819328308, "learning_rate": 1.6900821991500555e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10961 }, { "completion_length": 659.0, "epoch": 3.0382483370288247, "grad_norm": 0.3557971715927124, "kl": 0.15808171033859253, "learning_rate": 1.6896680307806687e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10962 }, { "completion_length": 626.25, "epoch": 3.0385254988913526, "grad_norm": 0.0, "kl": 0.1525956243276596, "learning_rate": 1.6892538872607936e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10963 }, { "completion_length": 660.25, "epoch": 3.03880266075388, "grad_norm": 0.0, "kl": 0.18126635253429413, "learning_rate": 1.6888397686031286e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10964 }, { "completion_length": 543.5, "epoch": 3.039079822616408, "grad_norm": 0.0, "kl": 0.2716100513935089, "learning_rate": 1.6884256748203742e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10965 }, { "completion_length": 662.25, "epoch": 3.039356984478936, "grad_norm": 0.3760834336280823, "kl": 0.17690473794937134, "learning_rate": 1.6880116059252275e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10966 }, { "completion_length": 602.75, "epoch": 3.0396341463414633, "grad_norm": 0.38113507628440857, "kl": 0.16453507542610168, "learning_rate": 1.6875975619303872e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10967 }, { "completion_length": 604.5, "epoch": 3.0399113082039912, "grad_norm": 0.0, "kl": 0.19121390581130981, "learning_rate": 1.6871835428485505e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10968 }, { "completion_length": 566.25, "epoch": 3.0401884700665187, "grad_norm": 0.0, "kl": 0.1975264847278595, "learning_rate": 1.686769548692413e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10969 }, { "completion_length": 584.5, "epoch": 3.0404656319290466, "grad_norm": 0.0, "kl": 2.2482943534851074, "learning_rate": 1.686355579474671e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10970 }, { "completion_length": 583.75, "epoch": 3.040742793791574, "grad_norm": 0.0, "kl": 0.17771941423416138, "learning_rate": 1.6859416352080177e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10971 }, { "completion_length": 554.75, "epoch": 3.041019955654102, "grad_norm": 0.0, "kl": 0.1875653862953186, "learning_rate": 1.6855277159051494e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10972 }, { "completion_length": 698.25, "epoch": 3.04129711751663, "grad_norm": 0.0, "kl": 0.1727486252784729, "learning_rate": 1.6851138215787567e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10973 }, { "completion_length": 551.25, "epoch": 3.0415742793791574, "grad_norm": 0.0, "kl": 0.20742762088775635, "learning_rate": 1.6846999522415335e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10974 }, { "completion_length": 544.25, "epoch": 3.0418514412416853, "grad_norm": 0.0, "kl": 41948520448.0, "learning_rate": 1.6842861079061717e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10975 }, { "completion_length": 605.5, "epoch": 3.0421286031042127, "grad_norm": 0.45340538024902344, "kl": 22828013568.0, "learning_rate": 1.6838722885853615e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10976 }, { "completion_length": 585.0, "epoch": 3.0424057649667406, "grad_norm": 0.0, "kl": 0.17690135538578033, "learning_rate": 1.6834584942917934e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10977 }, { "completion_length": 674.5, "epoch": 3.042682926829268, "grad_norm": 0.0, "kl": 0.22499516606330872, "learning_rate": 1.683044725038156e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10978 }, { "completion_length": 562.5, "epoch": 3.042960088691796, "grad_norm": 0.0, "kl": 0.21928073465824127, "learning_rate": 1.6826309808371391e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10979 }, { "completion_length": 619.25, "epoch": 3.043237250554324, "grad_norm": 0.0, "kl": 0.18100808560848236, "learning_rate": 1.6822172617014293e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10980 }, { "completion_length": 571.25, "epoch": 3.0435144124168514, "grad_norm": 0.0, "kl": 0.18645299971103668, "learning_rate": 1.681803567643714e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10981 }, { "completion_length": 531.0, "epoch": 3.0437915742793793, "grad_norm": 4.820028305053711, "kl": 0.2109239101409912, "learning_rate": 1.6813898986766803e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10982 }, { "completion_length": 566.5, "epoch": 3.0440687361419068, "grad_norm": 0.0, "kl": 0.1828662008047104, "learning_rate": 1.6809762548130126e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10983 }, { "completion_length": 541.5, "epoch": 3.0443458980044347, "grad_norm": 0.0, "kl": 0.16831150650978088, "learning_rate": 1.6805626360653965e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10984 }, { "completion_length": 471.0, "epoch": 3.044623059866962, "grad_norm": 0.0, "kl": 0.22305887937545776, "learning_rate": 1.6801490424465154e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10985 }, { "completion_length": 550.25, "epoch": 3.04490022172949, "grad_norm": 0.0, "kl": 0.161863774061203, "learning_rate": 1.679735473969053e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10986 }, { "completion_length": 584.25, "epoch": 3.045177383592018, "grad_norm": 0.0, "kl": 0.24641650915145874, "learning_rate": 1.6793219306456906e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10987 }, { "completion_length": 582.75, "epoch": 3.0454545454545454, "grad_norm": 0.0, "kl": 0.1576412320137024, "learning_rate": 1.678908412489111e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10988 }, { "completion_length": 598.75, "epoch": 3.0457317073170733, "grad_norm": 0.0, "kl": 0.3107454478740692, "learning_rate": 1.6784949195119954e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10989 }, { "completion_length": 536.25, "epoch": 3.046008869179601, "grad_norm": 0.0, "kl": 0.17168021202087402, "learning_rate": 1.678081451727022e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10990 }, { "completion_length": 586.0, "epoch": 3.0462860310421287, "grad_norm": 0.0, "kl": 0.17898257076740265, "learning_rate": 1.6776680091468727e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10991 }, { "completion_length": 527.5, "epoch": 3.046563192904656, "grad_norm": 0.0, "kl": 0.17531675100326538, "learning_rate": 1.6772545917842237e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10992 }, { "completion_length": 582.0, "epoch": 3.046840354767184, "grad_norm": 0.0, "kl": 31149095124992.0, "learning_rate": 1.6768411996517548e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10993 }, { "completion_length": 507.25, "epoch": 3.0471175166297115, "grad_norm": 0.5943042635917664, "kl": 5531551744.0, "learning_rate": 1.6764278327621409e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10994 }, { "completion_length": 585.75, "epoch": 3.0473946784922394, "grad_norm": 0.0, "kl": 0.23107081651687622, "learning_rate": 1.6760144911280598e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10995 }, { "completion_length": 506.75, "epoch": 3.0476718403547673, "grad_norm": 0.0, "kl": 0.28132835030555725, "learning_rate": 1.675601174762187e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10996 }, { "completion_length": 554.75, "epoch": 3.047949002217295, "grad_norm": 0.0, "kl": 0.21876554191112518, "learning_rate": 1.6751878836771965e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10997 }, { "completion_length": 616.5, "epoch": 3.0482261640798227, "grad_norm": 0.0, "kl": 0.17996945977210999, "learning_rate": 1.6747746178857627e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10998 }, { "completion_length": 482.25, "epoch": 3.04850332594235, "grad_norm": 0.0, "kl": 0.19629235565662384, "learning_rate": 1.6743613774005583e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 10999 }, { "completion_length": 604.75, "epoch": 3.048780487804878, "grad_norm": 0.0, "kl": 0.201067715883255, "learning_rate": 1.6739481622342563e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11000 }, { "completion_length": 606.5, "epoch": 3.0490576496674056, "grad_norm": 0.0, "kl": 0.22569046914577484, "learning_rate": 1.6735349723995263e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11001 }, { "completion_length": 500.25, "epoch": 3.0493348115299335, "grad_norm": 0.0, "kl": 0.5040731430053711, "learning_rate": 1.673121807909043e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11002 }, { "completion_length": 597.5, "epoch": 3.0496119733924614, "grad_norm": 0.0, "kl": 0.19441622495651245, "learning_rate": 1.6727086687754724e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11003 }, { "completion_length": 736.0, "epoch": 3.049889135254989, "grad_norm": 0.0, "kl": 0.16855227947235107, "learning_rate": 1.672295555011486e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11004 }, { "completion_length": 524.5, "epoch": 3.0501662971175167, "grad_norm": 0.0, "kl": 0.18640132248401642, "learning_rate": 1.671882466629752e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11005 }, { "completion_length": 619.0, "epoch": 3.050443458980044, "grad_norm": 2.2728631496429443, "kl": 2537183.5, "learning_rate": 1.6714694036429375e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11006 }, { "completion_length": 606.0, "epoch": 3.050720620842572, "grad_norm": 0.0, "kl": 0.19128775596618652, "learning_rate": 1.6710563660637103e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11007 }, { "completion_length": 613.75, "epoch": 3.0509977827050996, "grad_norm": 0.0, "kl": 0.1828041970729828, "learning_rate": 1.6706433539047356e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11008 }, { "completion_length": 564.25, "epoch": 3.0512749445676275, "grad_norm": 0.0, "kl": 0.1885972023010254, "learning_rate": 1.6702303671786795e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11009 }, { "completion_length": 592.5, "epoch": 3.0515521064301554, "grad_norm": 0.0, "kl": 0.2813093364238739, "learning_rate": 1.6698174058982058e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11010 }, { "completion_length": 538.0, "epoch": 3.051829268292683, "grad_norm": 0.0, "kl": 0.218556210398674, "learning_rate": 1.6694044700759788e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11011 }, { "completion_length": 505.0, "epoch": 3.0521064301552108, "grad_norm": 0.0, "kl": 28780655542272.0, "learning_rate": 1.6689915597246625e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11012 }, { "completion_length": 725.75, "epoch": 3.0523835920177382, "grad_norm": 0.0, "kl": 0.12339400500059128, "learning_rate": 1.6685786748569167e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11013 }, { "completion_length": 846.0, "epoch": 3.052660753880266, "grad_norm": 0.0, "kl": 0.18365724384784698, "learning_rate": 1.6681658154854053e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11014 }, { "completion_length": 588.75, "epoch": 3.0529379157427936, "grad_norm": 0.4209543466567993, "kl": 1576795774124032.0, "learning_rate": 1.6677529816227877e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11015 }, { "completion_length": 585.5, "epoch": 3.0532150776053215, "grad_norm": 9.01918888092041, "kl": 72108.59375, "learning_rate": 1.6673401732817247e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11016 }, { "completion_length": 550.75, "epoch": 3.0534922394678494, "grad_norm": 0.3821612298488617, "kl": 8901461991751680.0, "learning_rate": 1.666927390474874e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11017 }, { "completion_length": 560.0, "epoch": 3.053769401330377, "grad_norm": 0.0, "kl": 0.181517094373703, "learning_rate": 1.6665146332148947e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11018 }, { "completion_length": 513.75, "epoch": 3.054046563192905, "grad_norm": 3.8329999446868896, "kl": 1646474.875, "learning_rate": 1.6661019015144447e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11019 }, { "completion_length": 599.0, "epoch": 3.0543237250554323, "grad_norm": 0.0, "kl": 0.14588268101215363, "learning_rate": 1.6656891953861804e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11020 }, { "completion_length": 566.75, "epoch": 3.05460088691796, "grad_norm": 0.0, "kl": 0.3225169777870178, "learning_rate": 1.6652765148427583e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11021 }, { "completion_length": 617.0, "epoch": 3.0548780487804876, "grad_norm": 0.0, "kl": 0.18624386191368103, "learning_rate": 1.664863859896832e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11022 }, { "completion_length": 502.0, "epoch": 3.0551552106430155, "grad_norm": 0.0, "kl": 0.2016269713640213, "learning_rate": 1.6644512305610586e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11023 }, { "completion_length": 561.75, "epoch": 3.0554323725055434, "grad_norm": 0.0, "kl": 0.17112700641155243, "learning_rate": 1.6640386268480886e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11024 }, { "completion_length": 568.0, "epoch": 3.055709534368071, "grad_norm": 0.0, "kl": 0.20667067170143127, "learning_rate": 1.6636260487705769e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11025 }, { "completion_length": 497.25, "epoch": 3.055986696230599, "grad_norm": 0.0, "kl": 0.21190649271011353, "learning_rate": 1.6632134963411756e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11026 }, { "completion_length": 551.0, "epoch": 3.0562638580931263, "grad_norm": 0.0, "kl": 0.5457439422607422, "learning_rate": 1.6628009695725348e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11027 }, { "completion_length": 530.5, "epoch": 3.056541019955654, "grad_norm": 0.0, "kl": 0.18699075281620026, "learning_rate": 1.662388468477306e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11028 }, { "completion_length": 586.75, "epoch": 3.0568181818181817, "grad_norm": 0.0, "kl": 0.2018488496541977, "learning_rate": 1.6619759930681385e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11029 }, { "completion_length": 646.75, "epoch": 3.0570953436807096, "grad_norm": 0.0, "kl": 0.2086290717124939, "learning_rate": 1.6615635433576814e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11030 }, { "completion_length": 554.0, "epoch": 3.0573725055432375, "grad_norm": 1.9880192279815674, "kl": 31763418054656.0, "learning_rate": 1.661151119358582e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11031 }, { "completion_length": 601.75, "epoch": 3.057649667405765, "grad_norm": 0.0, "kl": 0.23420867323875427, "learning_rate": 1.6607387210834889e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11032 }, { "completion_length": 598.0, "epoch": 3.057926829268293, "grad_norm": 0.3531784117221832, "kl": 0.4650040566921234, "learning_rate": 1.6603263485450465e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11033 }, { "completion_length": 547.0, "epoch": 3.0582039911308203, "grad_norm": 0.0, "kl": 0.24025624990463257, "learning_rate": 1.6599140017559033e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11034 }, { "completion_length": 558.75, "epoch": 3.058481152993348, "grad_norm": 0.0, "kl": 0.20510455965995789, "learning_rate": 1.6595016807287032e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11035 }, { "completion_length": 676.5, "epoch": 3.0587583148558757, "grad_norm": 0.0, "kl": 0.16862687468528748, "learning_rate": 1.6590893854760895e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11036 }, { "completion_length": 642.75, "epoch": 3.0590354767184036, "grad_norm": 0.0, "kl": 0.1581854671239853, "learning_rate": 1.6586771160107068e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11037 }, { "completion_length": 577.0, "epoch": 3.059312638580931, "grad_norm": 0.48047196865081787, "kl": 1204531362791424.0, "learning_rate": 1.6582648723451965e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11038 }, { "completion_length": 608.0, "epoch": 3.059589800443459, "grad_norm": 0.43021389842033386, "kl": 7.454725742340088, "learning_rate": 1.657852654492202e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11039 }, { "completion_length": 680.0, "epoch": 3.059866962305987, "grad_norm": 0.0, "kl": 0.14615032076835632, "learning_rate": 1.6574404624643626e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11040 }, { "completion_length": 651.5, "epoch": 3.0601441241685143, "grad_norm": 0.0, "kl": 0.16627953946590424, "learning_rate": 1.6570282962743187e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11041 }, { "completion_length": 636.75, "epoch": 3.0604212860310422, "grad_norm": 0.0, "kl": 0.15922625362873077, "learning_rate": 1.6566161559347119e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11042 }, { "completion_length": 620.75, "epoch": 3.0606984478935697, "grad_norm": 0.0, "kl": 0.1559077799320221, "learning_rate": 1.6562040414581776e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11043 }, { "completion_length": 662.5, "epoch": 3.0609756097560976, "grad_norm": 0.0, "kl": 0.1701987385749817, "learning_rate": 1.655791952857357e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11044 }, { "completion_length": 502.5, "epoch": 3.061252771618625, "grad_norm": 0.0, "kl": 0.20567597448825836, "learning_rate": 1.6553798901448835e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11045 }, { "completion_length": 610.75, "epoch": 3.061529933481153, "grad_norm": 0.0, "kl": 0.1884833574295044, "learning_rate": 1.6549678533333962e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11046 }, { "completion_length": 580.5, "epoch": 3.061807095343681, "grad_norm": 0.477631539106369, "kl": 0.1922263205051422, "learning_rate": 1.6545558424355292e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11047 }, { "completion_length": 649.5, "epoch": 3.0620842572062084, "grad_norm": 0.0, "kl": 0.1681283712387085, "learning_rate": 1.654143857463918e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11048 }, { "completion_length": 557.75, "epoch": 3.0623614190687363, "grad_norm": 0.0, "kl": 0.1617843508720398, "learning_rate": 1.653731898431196e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11049 }, { "completion_length": 648.25, "epoch": 3.0626385809312637, "grad_norm": 0.0, "kl": 0.16182811558246613, "learning_rate": 1.653319965349996e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11050 }, { "completion_length": 685.75, "epoch": 3.0629157427937916, "grad_norm": 0.0, "kl": 0.3571719825267792, "learning_rate": 1.6529080582329509e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11051 }, { "completion_length": 470.75, "epoch": 3.063192904656319, "grad_norm": 0.0, "kl": 0.43309032917022705, "learning_rate": 1.6524961770926909e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11052 }, { "completion_length": 607.25, "epoch": 3.063470066518847, "grad_norm": 0.0, "kl": 0.19527730345726013, "learning_rate": 1.6520843219418492e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11053 }, { "completion_length": 550.75, "epoch": 3.063747228381375, "grad_norm": 6.768996238708496, "kl": 18387896320.0, "learning_rate": 1.6516724927930525e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11054 }, { "completion_length": 581.75, "epoch": 3.0640243902439024, "grad_norm": 0.0, "kl": 0.1885385513305664, "learning_rate": 1.6512606896589323e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11055 }, { "completion_length": 485.0, "epoch": 3.0643015521064303, "grad_norm": 3.9222874641418457, "kl": 0.3134271204471588, "learning_rate": 1.6508489125521159e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11056 }, { "completion_length": 567.25, "epoch": 3.0645787139689578, "grad_norm": 0.0, "kl": 0.16957500576972961, "learning_rate": 1.650437161485231e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11057 }, { "completion_length": 581.0, "epoch": 3.0648558758314857, "grad_norm": 0.0, "kl": 0.1890944242477417, "learning_rate": 1.6500254364709048e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11058 }, { "completion_length": 971.5, "epoch": 3.065133037694013, "grad_norm": 0.23789775371551514, "kl": 0.15243080258369446, "learning_rate": 1.649613737521762e-06, "loss": 0.0, "reward": 4.09375, "reward_std": 3.3125, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.59375, "step": 11059 }, { "completion_length": 576.75, "epoch": 3.065410199556541, "grad_norm": 0.0, "kl": 0.19632169604301453, "learning_rate": 1.6492020646504289e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11060 }, { "completion_length": 626.0, "epoch": 3.065687361419069, "grad_norm": 0.0, "kl": 0.15593869984149933, "learning_rate": 1.6487904178695285e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11061 }, { "completion_length": 549.0, "epoch": 3.0659645232815964, "grad_norm": 0.0, "kl": 0.208607017993927, "learning_rate": 1.648378797191686e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11062 }, { "completion_length": 522.5, "epoch": 3.0662416851441243, "grad_norm": 0.5005469918251038, "kl": 0.2173619121313095, "learning_rate": 1.6479672026295223e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11063 }, { "completion_length": 703.25, "epoch": 3.066518847006652, "grad_norm": 0.0, "kl": 0.16423748433589935, "learning_rate": 1.64755563419566e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11064 }, { "completion_length": 484.75, "epoch": 3.0667960088691797, "grad_norm": 0.0, "kl": 11487289344.0, "learning_rate": 1.647144091902721e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11065 }, { "completion_length": 529.75, "epoch": 3.067073170731707, "grad_norm": 0.0, "kl": 214099148800.0, "learning_rate": 1.6467325757633242e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11066 }, { "completion_length": 565.5, "epoch": 3.067350332594235, "grad_norm": 0.0, "kl": 0.17013342678546906, "learning_rate": 1.6463210857900907e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11067 }, { "completion_length": 592.5, "epoch": 3.0676274944567625, "grad_norm": 0.0, "kl": 0.1997860074043274, "learning_rate": 1.6459096219956377e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11068 }, { "completion_length": 597.25, "epoch": 3.0679046563192904, "grad_norm": 0.0, "kl": 0.18647895753383636, "learning_rate": 1.6454981843925838e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11069 }, { "completion_length": 643.5, "epoch": 3.0681818181818183, "grad_norm": 0.0, "kl": 0.17325933277606964, "learning_rate": 1.645086772993546e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11070 }, { "completion_length": 647.25, "epoch": 3.068458980044346, "grad_norm": 0.0, "kl": 0.1646016240119934, "learning_rate": 1.6446753878111396e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11071 }, { "completion_length": 594.25, "epoch": 3.0687361419068737, "grad_norm": 0.0, "kl": 7.105718612670898, "learning_rate": 1.6442640288579828e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11072 }, { "completion_length": 598.0, "epoch": 3.069013303769401, "grad_norm": 0.0, "kl": 666628853334016.0, "learning_rate": 1.6438526961466866e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11073 }, { "completion_length": 557.0, "epoch": 3.069290465631929, "grad_norm": 0.0, "kl": 0.17080485820770264, "learning_rate": 1.6434413896898689e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11074 }, { "completion_length": 557.0, "epoch": 3.0695676274944566, "grad_norm": 0.6361745595932007, "kl": 2484070302875648.0, "learning_rate": 1.6430301095001383e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11075 }, { "completion_length": 581.0, "epoch": 3.0698447893569845, "grad_norm": 1.7985440492630005, "kl": 17769424.0, "learning_rate": 1.6426188555901108e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11076 }, { "completion_length": 541.0, "epoch": 3.0701219512195124, "grad_norm": 0.0, "kl": 0.18324726819992065, "learning_rate": 1.6422076279723957e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11077 }, { "completion_length": 555.75, "epoch": 3.07039911308204, "grad_norm": 0.0, "kl": 1801423234793472.0, "learning_rate": 1.6417964266596043e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11078 }, { "completion_length": 535.75, "epoch": 3.0706762749445677, "grad_norm": 0.0, "kl": 0.18417677283287048, "learning_rate": 1.6413852516643468e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11079 }, { "completion_length": 648.25, "epoch": 3.070953436807095, "grad_norm": 0.0, "kl": 0.153103768825531, "learning_rate": 1.6409741029992318e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11080 }, { "completion_length": 581.0, "epoch": 3.071230598669623, "grad_norm": 0.0, "kl": 0.19727590680122375, "learning_rate": 1.640562980676868e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11081 }, { "completion_length": 524.25, "epoch": 3.0715077605321506, "grad_norm": 0.0, "kl": 0.3622903525829315, "learning_rate": 1.6401518847098618e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11082 }, { "completion_length": 639.75, "epoch": 3.0717849223946785, "grad_norm": 0.0, "kl": 0.15451906621456146, "learning_rate": 1.6397408151108208e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11083 }, { "completion_length": 566.75, "epoch": 3.0720620842572064, "grad_norm": 0.9389859437942505, "kl": 1856071861796864.0, "learning_rate": 1.6393297718923494e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11084 }, { "completion_length": 608.75, "epoch": 3.072339246119734, "grad_norm": 0.0, "kl": 0.17999859154224396, "learning_rate": 1.6389187550670544e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11085 }, { "completion_length": 592.5, "epoch": 3.0726164079822618, "grad_norm": 0.0, "kl": 1.400903344154358, "learning_rate": 1.6385077646475394e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11086 }, { "completion_length": 551.0, "epoch": 3.0728935698447892, "grad_norm": 0.0, "kl": 630747299840.0, "learning_rate": 1.6380968006464073e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11087 }, { "completion_length": 622.25, "epoch": 3.073170731707317, "grad_norm": 0.0, "kl": 486083225714688.0, "learning_rate": 1.6376858630762616e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11088 }, { "completion_length": 596.75, "epoch": 3.0734478935698446, "grad_norm": 0.0, "kl": 0.20670512318611145, "learning_rate": 1.6372749519497023e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11089 }, { "completion_length": 569.75, "epoch": 3.0737250554323725, "grad_norm": 0.0, "kl": 0.23640701174736023, "learning_rate": 1.6368640672793321e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11090 }, { "completion_length": 520.0, "epoch": 3.0740022172949004, "grad_norm": 0.0, "kl": 0.1909506767988205, "learning_rate": 1.6364532090777497e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11091 }, { "completion_length": 478.5, "epoch": 3.074279379157428, "grad_norm": 0.0, "kl": 0.20373408496379852, "learning_rate": 1.6360423773575548e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11092 }, { "completion_length": 526.5, "epoch": 3.074556541019956, "grad_norm": 0.0, "kl": 0.21505174040794373, "learning_rate": 1.6356315721313475e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11093 }, { "completion_length": 582.75, "epoch": 3.0748337028824833, "grad_norm": 0.0, "kl": 0.1744326651096344, "learning_rate": 1.6352207934117232e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11094 }, { "completion_length": 594.25, "epoch": 3.075110864745011, "grad_norm": 0.0, "kl": 0.20484574139118195, "learning_rate": 1.6348100412112804e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11095 }, { "completion_length": 633.5, "epoch": 3.0753880266075386, "grad_norm": 0.0, "kl": 0.15728473663330078, "learning_rate": 1.634399315542614e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11096 }, { "completion_length": 517.25, "epoch": 3.0756651884700665, "grad_norm": 0.0, "kl": 0.31599727272987366, "learning_rate": 1.6339886164183205e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11097 }, { "completion_length": 616.0, "epoch": 3.0759423503325944, "grad_norm": 0.0, "kl": 0.18771392107009888, "learning_rate": 1.633577943850993e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11098 }, { "completion_length": 615.5, "epoch": 3.076219512195122, "grad_norm": 0.0, "kl": 1014944064.0, "learning_rate": 1.6331672978532264e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11099 }, { "completion_length": 582.75, "epoch": 3.07649667405765, "grad_norm": 0.0, "kl": 0.19013966619968414, "learning_rate": 1.632756678437612e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11100 }, { "completion_length": 577.75, "epoch": 3.0767738359201773, "grad_norm": 0.0, "kl": 0.1795678436756134, "learning_rate": 1.6323460856167427e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11101 }, { "completion_length": 604.5, "epoch": 3.077050997782705, "grad_norm": 0.0, "kl": 0.2972947359085083, "learning_rate": 1.6319355194032105e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11102 }, { "completion_length": 535.75, "epoch": 3.0773281596452327, "grad_norm": 0.0, "kl": 0.7291284799575806, "learning_rate": 1.6315249798096032e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11103 }, { "completion_length": 648.25, "epoch": 3.0776053215077606, "grad_norm": 0.0, "kl": 0.22619858384132385, "learning_rate": 1.631114466848514e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11104 }, { "completion_length": 649.5, "epoch": 3.0778824833702885, "grad_norm": 0.0, "kl": 0.16741347312927246, "learning_rate": 1.630703980532528e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11105 }, { "completion_length": 648.0, "epoch": 3.078159645232816, "grad_norm": 0.0, "kl": 3621864079360.0, "learning_rate": 1.6302935208742366e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11106 }, { "completion_length": 577.0, "epoch": 3.078436807095344, "grad_norm": 0.0, "kl": 0.18415994942188263, "learning_rate": 1.6298830878862228e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11107 }, { "completion_length": 553.5, "epoch": 3.0787139689578713, "grad_norm": 0.0, "kl": 0.2040734887123108, "learning_rate": 1.629472681581076e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11108 }, { "completion_length": 544.25, "epoch": 3.078991130820399, "grad_norm": 0.0, "kl": 0.15271708369255066, "learning_rate": 1.6290623019713813e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11109 }, { "completion_length": 624.25, "epoch": 3.0792682926829267, "grad_norm": 0.0, "kl": 0.1815957874059677, "learning_rate": 1.6286519490697223e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11110 }, { "completion_length": 607.75, "epoch": 3.0795454545454546, "grad_norm": 0.0, "kl": 0.20017902553081512, "learning_rate": 1.6282416228886838e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11111 }, { "completion_length": 650.0, "epoch": 3.079822616407982, "grad_norm": 0.0, "kl": 0.15020287036895752, "learning_rate": 1.6278313234408477e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11112 }, { "completion_length": 656.0, "epoch": 3.08009977827051, "grad_norm": 0.0, "kl": 8897.7490234375, "learning_rate": 1.6274210507387972e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11113 }, { "completion_length": 552.5, "epoch": 3.080376940133038, "grad_norm": 0.0, "kl": 0.18918104469776154, "learning_rate": 1.6270108047951126e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11114 }, { "completion_length": 519.25, "epoch": 3.0806541019955653, "grad_norm": 0.3907712697982788, "kl": 5676833280.0, "learning_rate": 1.626600585622375e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11115 }, { "completion_length": 567.0, "epoch": 3.0809312638580932, "grad_norm": 0.0, "kl": 0.18495424091815948, "learning_rate": 1.6261903932331649e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11116 }, { "completion_length": 525.5, "epoch": 3.0812084257206207, "grad_norm": 6.325798511505127, "kl": 3412796.25, "learning_rate": 1.6257802276400604e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11117 }, { "completion_length": 626.0, "epoch": 3.0814855875831486, "grad_norm": 1.634893536567688, "kl": 50024116.0, "learning_rate": 1.6253700888556395e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11118 }, { "completion_length": 616.75, "epoch": 3.081762749445676, "grad_norm": 0.42642974853515625, "kl": 1.0151099369521152e+16, "learning_rate": 1.6249599768924796e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11119 }, { "completion_length": 526.75, "epoch": 3.082039911308204, "grad_norm": 0.0, "kl": 0.1924879103899002, "learning_rate": 1.6245498917631575e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11120 }, { "completion_length": 558.75, "epoch": 3.082317073170732, "grad_norm": 0.0, "kl": 0.20151863992214203, "learning_rate": 1.6241398334802478e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11121 }, { "completion_length": 517.75, "epoch": 3.0825942350332594, "grad_norm": 0.0, "kl": 0.456821084022522, "learning_rate": 1.6237298020563253e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11122 }, { "completion_length": 530.25, "epoch": 3.0828713968957873, "grad_norm": 0.0, "kl": 0.16841678321361542, "learning_rate": 1.6233197975039663e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11123 }, { "completion_length": 537.0, "epoch": 3.0831485587583147, "grad_norm": 0.0, "kl": 0.2159399539232254, "learning_rate": 1.6229098198357406e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11124 }, { "completion_length": 548.5, "epoch": 3.0834257206208426, "grad_norm": 0.0, "kl": 0.2021501660346985, "learning_rate": 1.6224998690642241e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11125 }, { "completion_length": 572.5, "epoch": 3.08370288248337, "grad_norm": 0.0, "kl": 0.16832944750785828, "learning_rate": 1.6220899452019843e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11126 }, { "completion_length": 535.75, "epoch": 3.083980044345898, "grad_norm": 0.0, "kl": 78090298458112.0, "learning_rate": 1.621680048261595e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11127 }, { "completion_length": 595.0, "epoch": 3.084257206208426, "grad_norm": 0.0, "kl": 0.21847029030323029, "learning_rate": 1.6212701782556245e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11128 }, { "completion_length": 506.75, "epoch": 3.0845343680709534, "grad_norm": 0.0, "kl": 0.22169557213783264, "learning_rate": 1.6208603351966423e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11129 }, { "completion_length": 536.5, "epoch": 3.0848115299334813, "grad_norm": 0.0, "kl": 0.18783971667289734, "learning_rate": 1.6204505190972165e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11130 }, { "completion_length": 552.75, "epoch": 3.0850886917960088, "grad_norm": 0.0, "kl": 0.17342111468315125, "learning_rate": 1.6200407299699141e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11131 }, { "completion_length": 557.25, "epoch": 3.0853658536585367, "grad_norm": 0.9091874361038208, "kl": 3.083344136896512e+16, "learning_rate": 1.6196309678273025e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11132 }, { "completion_length": 497.75, "epoch": 3.085643015521064, "grad_norm": 0.0, "kl": 0.16749811172485352, "learning_rate": 1.6192212326819467e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11133 }, { "completion_length": 549.5, "epoch": 3.085920177383592, "grad_norm": 1.0898518562316895, "kl": 134385399496704.0, "learning_rate": 1.618811524546412e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11134 }, { "completion_length": 647.25, "epoch": 3.08619733924612, "grad_norm": 0.0, "kl": 0.29188570380210876, "learning_rate": 1.618401843433261e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11135 }, { "completion_length": 555.75, "epoch": 3.0864745011086474, "grad_norm": 0.0, "kl": 0.24319681525230408, "learning_rate": 1.6179921893550599e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11136 }, { "completion_length": 527.5, "epoch": 3.0867516629711753, "grad_norm": 0.0, "kl": 0.21934129297733307, "learning_rate": 1.617582562324368e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11137 }, { "completion_length": 614.75, "epoch": 3.087028824833703, "grad_norm": 1.8409998416900635, "kl": 3305183.25, "learning_rate": 1.617172962353748e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11138 }, { "completion_length": 581.0, "epoch": 3.0873059866962307, "grad_norm": 0.0, "kl": 0.36397314071655273, "learning_rate": 1.616763389455762e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11139 }, { "completion_length": 711.25, "epoch": 3.087583148558758, "grad_norm": 0.0, "kl": 0.15871089696884155, "learning_rate": 1.6163538436429677e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11140 }, { "completion_length": 515.25, "epoch": 3.087860310421286, "grad_norm": 0.0, "kl": 0.1872534602880478, "learning_rate": 1.6159443249279256e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11141 }, { "completion_length": 682.0, "epoch": 3.0881374722838135, "grad_norm": 0.0, "kl": 0.15938328206539154, "learning_rate": 1.6155348333231931e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11142 }, { "completion_length": 526.75, "epoch": 3.0884146341463414, "grad_norm": 0.0, "kl": 0.21317094564437866, "learning_rate": 1.6151253688413287e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11143 }, { "completion_length": 588.25, "epoch": 3.0886917960088693, "grad_norm": 0.0, "kl": 0.2209438532590866, "learning_rate": 1.6147159314948873e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11144 }, { "completion_length": 576.25, "epoch": 3.088968957871397, "grad_norm": 0.0, "kl": 0.20082199573516846, "learning_rate": 1.6143065212964253e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11145 }, { "completion_length": 586.75, "epoch": 3.0892461197339247, "grad_norm": 0.0, "kl": 0.20448069274425507, "learning_rate": 1.613897138258499e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11146 }, { "completion_length": 603.75, "epoch": 3.089523281596452, "grad_norm": 0.0, "kl": 0.17276056110858917, "learning_rate": 1.613487782393661e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11147 }, { "completion_length": 657.5, "epoch": 3.08980044345898, "grad_norm": 0.0, "kl": 0.1683003306388855, "learning_rate": 1.613078453714465e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11148 }, { "completion_length": 598.75, "epoch": 3.0900776053215075, "grad_norm": 0.0, "kl": 0.19423207640647888, "learning_rate": 1.6126691522334632e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11149 }, { "completion_length": 639.75, "epoch": 3.0903547671840355, "grad_norm": 0.0, "kl": 0.18334802985191345, "learning_rate": 1.6122598779632076e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11150 }, { "completion_length": 685.0, "epoch": 3.0906319290465634, "grad_norm": 0.0, "kl": 0.14725355803966522, "learning_rate": 1.6118506309162483e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11151 }, { "completion_length": 605.5, "epoch": 3.090909090909091, "grad_norm": 0.0, "kl": 0.15539005398750305, "learning_rate": 1.6114414111051351e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11152 }, { "completion_length": 534.25, "epoch": 3.0911862527716187, "grad_norm": 0.0, "kl": 0.18066614866256714, "learning_rate": 1.6110322185424188e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11153 }, { "completion_length": 581.5, "epoch": 3.091463414634146, "grad_norm": 0.0, "kl": 0.1946004033088684, "learning_rate": 1.6106230532406448e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11154 }, { "completion_length": 585.0, "epoch": 3.091740576496674, "grad_norm": 0.0, "kl": 0.2823137640953064, "learning_rate": 1.6102139152123637e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11155 }, { "completion_length": 593.25, "epoch": 3.0920177383592016, "grad_norm": 0.0, "kl": 28.121742248535156, "learning_rate": 1.6098048044701186e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11156 }, { "completion_length": 522.75, "epoch": 3.0922949002217295, "grad_norm": 0.0, "kl": 0.24412375688552856, "learning_rate": 1.6093957210264588e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11157 }, { "completion_length": 517.0, "epoch": 3.0925720620842574, "grad_norm": 0.0, "kl": 0.19616690278053284, "learning_rate": 1.6089866648939256e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11158 }, { "completion_length": 588.75, "epoch": 3.092849223946785, "grad_norm": 0.0, "kl": 0.30966460704803467, "learning_rate": 1.6085776360850664e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11159 }, { "completion_length": 544.75, "epoch": 3.0931263858093128, "grad_norm": 0.3934139609336853, "kl": 0.19987677037715912, "learning_rate": 1.608168634612422e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11160 }, { "completion_length": 619.25, "epoch": 3.0934035476718402, "grad_norm": 0.0, "kl": 2.6579816341400146, "learning_rate": 1.6077596604885359e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11161 }, { "completion_length": 624.5, "epoch": 3.093680709534368, "grad_norm": 0.0, "kl": 66310180864.0, "learning_rate": 1.6073507137259497e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11162 }, { "completion_length": 514.5, "epoch": 3.0939578713968956, "grad_norm": 0.0, "kl": 0.20828601717948914, "learning_rate": 1.6069417943372032e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11163 }, { "completion_length": 540.0, "epoch": 3.0942350332594235, "grad_norm": 0.0, "kl": 0.17438653111457825, "learning_rate": 1.6065329023348375e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11164 }, { "completion_length": 542.0, "epoch": 3.0945121951219514, "grad_norm": 0.0, "kl": 0.1635292023420334, "learning_rate": 1.6061240377313897e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11165 }, { "completion_length": 588.5, "epoch": 3.094789356984479, "grad_norm": 0.39125239849090576, "kl": 138125950976.0, "learning_rate": 1.6057152005394012e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11166 }, { "completion_length": 481.0, "epoch": 3.095066518847007, "grad_norm": 0.0, "kl": 0.20159302651882172, "learning_rate": 1.6053063907714056e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11167 }, { "completion_length": 629.25, "epoch": 3.0953436807095343, "grad_norm": 0.0, "kl": 0.15613535046577454, "learning_rate": 1.6048976084399414e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11168 }, { "completion_length": 584.5, "epoch": 3.095620842572062, "grad_norm": 0.0, "kl": 0.15495559573173523, "learning_rate": 1.6044888535575448e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11169 }, { "completion_length": 523.25, "epoch": 3.0958980044345896, "grad_norm": 0.0, "kl": 0.1860763281583786, "learning_rate": 1.6040801261367494e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11170 }, { "completion_length": 553.25, "epoch": 3.0961751662971175, "grad_norm": 0.0, "kl": 0.2864345908164978, "learning_rate": 1.60367142619009e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11171 }, { "completion_length": 610.0, "epoch": 3.0964523281596454, "grad_norm": 0.0, "kl": 0.1574983149766922, "learning_rate": 1.6032627537300988e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11172 }, { "completion_length": 671.25, "epoch": 3.096729490022173, "grad_norm": 0.0, "kl": 0.14972050487995148, "learning_rate": 1.6028541087693091e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11173 }, { "completion_length": 565.5, "epoch": 3.097006651884701, "grad_norm": 0.0, "kl": 0.16412445902824402, "learning_rate": 1.6024454913202514e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11174 }, { "completion_length": 588.0, "epoch": 3.0972838137472283, "grad_norm": 3.0388429164886475, "kl": 4705962496.0, "learning_rate": 1.6020369013954562e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11175 }, { "completion_length": 613.25, "epoch": 3.097560975609756, "grad_norm": 0.0, "kl": 0.1688978224992752, "learning_rate": 1.6016283390074554e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11176 }, { "completion_length": 498.0, "epoch": 3.0978381374722836, "grad_norm": 0.0, "kl": 54498467840.0, "learning_rate": 1.6012198041687748e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11177 }, { "completion_length": 593.5, "epoch": 3.0981152993348116, "grad_norm": 0.0, "kl": 0.15867005288600922, "learning_rate": 1.6008112968919448e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11178 }, { "completion_length": 659.0, "epoch": 3.0983924611973395, "grad_norm": 0.0, "kl": 0.16235612332820892, "learning_rate": 1.6004028171894913e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11179 }, { "completion_length": 512.0, "epoch": 3.098669623059867, "grad_norm": 1.288847804069519, "kl": 6323657768960.0, "learning_rate": 1.5999943650739419e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11180 }, { "completion_length": 541.5, "epoch": 3.098946784922395, "grad_norm": 0.0, "kl": 0.19993893802165985, "learning_rate": 1.5995859405578209e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11181 }, { "completion_length": 543.25, "epoch": 3.0992239467849223, "grad_norm": 0.0, "kl": 0.2092924267053604, "learning_rate": 1.5991775436536532e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11182 }, { "completion_length": 603.0, "epoch": 3.09950110864745, "grad_norm": 0.0, "kl": 0.17390961945056915, "learning_rate": 1.5987691743739636e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11183 }, { "completion_length": 584.25, "epoch": 3.0997782705099777, "grad_norm": 0.0, "kl": 3076431.75, "learning_rate": 1.5983608327312739e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11184 }, { "completion_length": 555.75, "epoch": 3.1000554323725056, "grad_norm": 0.0, "kl": 0.2397003471851349, "learning_rate": 1.5979525187381074e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11185 }, { "completion_length": 527.25, "epoch": 3.100332594235033, "grad_norm": 0.0, "kl": 0.22786524891853333, "learning_rate": 1.5975442324069833e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11186 }, { "completion_length": 545.25, "epoch": 3.100609756097561, "grad_norm": 0.0, "kl": 92.76679229736328, "learning_rate": 1.5971359737504253e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11187 }, { "completion_length": 469.0, "epoch": 3.100886917960089, "grad_norm": 0.446701318025589, "kl": 4333315179937792.0, "learning_rate": 1.5967277427809497e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11188 }, { "completion_length": 568.5, "epoch": 3.1011640798226163, "grad_norm": 0.0, "kl": 0.23927263915538788, "learning_rate": 1.596319539511077e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11189 }, { "completion_length": 937.25, "epoch": 3.1014412416851442, "grad_norm": 0.0, "kl": 0.14850974082946777, "learning_rate": 1.5959113639533253e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11190 }, { "completion_length": 568.25, "epoch": 3.1017184035476717, "grad_norm": 0.0, "kl": 0.16311785578727722, "learning_rate": 1.5955032161202105e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11191 }, { "completion_length": 606.0, "epoch": 3.1019955654101996, "grad_norm": 0.0, "kl": 0.1546083241701126, "learning_rate": 1.5950950960242501e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11192 }, { "completion_length": 528.75, "epoch": 3.102272727272727, "grad_norm": 0.0, "kl": 0.2481645941734314, "learning_rate": 1.5946870036779585e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11193 }, { "completion_length": 612.5, "epoch": 3.102549889135255, "grad_norm": 0.0, "kl": 0.17084498703479767, "learning_rate": 1.5942789390938507e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11194 }, { "completion_length": 534.0, "epoch": 3.102827050997783, "grad_norm": 0.0, "kl": 0.200605571269989, "learning_rate": 1.59387090228444e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11195 }, { "completion_length": 635.0, "epoch": 3.1031042128603104, "grad_norm": 0.0, "kl": 0.3841780722141266, "learning_rate": 1.5934628932622395e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11196 }, { "completion_length": 554.0, "epoch": 3.1033813747228383, "grad_norm": 0.0, "kl": 0.18376636505126953, "learning_rate": 1.5930549120397601e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11197 }, { "completion_length": 658.25, "epoch": 3.1036585365853657, "grad_norm": 0.0, "kl": 1682252522061824.0, "learning_rate": 1.592646958629514e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11198 }, { "completion_length": 619.5, "epoch": 3.1039356984478936, "grad_norm": 0.0, "kl": 0.26238688826560974, "learning_rate": 1.592239033044012e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11199 }, { "completion_length": 505.5, "epoch": 3.104212860310421, "grad_norm": 4.72060489654541, "kl": 1.5528976097214464e+16, "learning_rate": 1.5918311352957622e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11200 }, { "completion_length": 516.0, "epoch": 3.104490022172949, "grad_norm": 0.0, "kl": 0.18003913760185242, "learning_rate": 1.5914232653972738e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11201 }, { "completion_length": 601.5, "epoch": 3.104767184035477, "grad_norm": 0.0, "kl": 57354.546875, "learning_rate": 1.591015423361054e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11202 }, { "completion_length": 585.5, "epoch": 3.1050443458980044, "grad_norm": 0.0, "kl": 0.1758366823196411, "learning_rate": 1.5906076091996103e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11203 }, { "completion_length": 518.0, "epoch": 3.1053215077605323, "grad_norm": 0.0, "kl": 0.19772310554981232, "learning_rate": 1.590199822925448e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11204 }, { "completion_length": 615.5, "epoch": 3.1055986696230597, "grad_norm": 0.0, "kl": 0.18422500789165497, "learning_rate": 1.5897920645510714e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11205 }, { "completion_length": 691.25, "epoch": 3.1058758314855877, "grad_norm": 0.0, "kl": 0.1684565246105194, "learning_rate": 1.5893843340889875e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11206 }, { "completion_length": 593.5, "epoch": 3.106152993348115, "grad_norm": 4.916304588317871, "kl": 799379.375, "learning_rate": 1.588976631551697e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11207 }, { "completion_length": 557.25, "epoch": 3.106430155210643, "grad_norm": 0.0, "kl": 0.2238595336675644, "learning_rate": 1.5885689569517044e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11208 }, { "completion_length": 584.25, "epoch": 3.106707317073171, "grad_norm": 0.0, "kl": 0.1709284633398056, "learning_rate": 1.588161310301509e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11209 }, { "completion_length": 681.5, "epoch": 3.1069844789356984, "grad_norm": 0.0, "kl": 0.16220863163471222, "learning_rate": 1.587753691613614e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11210 }, { "completion_length": 551.25, "epoch": 3.1072616407982263, "grad_norm": 0.42632246017456055, "kl": 0.18988122045993805, "learning_rate": 1.5873461009005182e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11211 }, { "completion_length": 543.75, "epoch": 3.1075388026607538, "grad_norm": 0.0, "kl": 0.25792941451072693, "learning_rate": 1.586938538174721e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11212 }, { "completion_length": 493.75, "epoch": 3.1078159645232817, "grad_norm": 0.0, "kl": 0.22557905316352844, "learning_rate": 1.5865310034487209e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11213 }, { "completion_length": 662.0, "epoch": 3.108093126385809, "grad_norm": 0.0, "kl": 0.1441531777381897, "learning_rate": 1.5861234967350142e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11214 }, { "completion_length": 531.0, "epoch": 3.108370288248337, "grad_norm": 0.0, "kl": 0.48122432827949524, "learning_rate": 1.5857160180460993e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11215 }, { "completion_length": 614.25, "epoch": 3.1086474501108645, "grad_norm": 0.0, "kl": 0.16991882026195526, "learning_rate": 1.5853085673944695e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11216 }, { "completion_length": 595.75, "epoch": 3.1089246119733924, "grad_norm": 0.0, "kl": 0.1985023319721222, "learning_rate": 1.5849011447926225e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11217 }, { "completion_length": 508.0, "epoch": 3.1092017738359203, "grad_norm": 0.0, "kl": 0.17983628809452057, "learning_rate": 1.5844937502530488e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11218 }, { "completion_length": 633.5, "epoch": 3.109478935698448, "grad_norm": 0.3105217516422272, "kl": 0.14663557708263397, "learning_rate": 1.5840863837882443e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11219 }, { "completion_length": 591.75, "epoch": 3.1097560975609757, "grad_norm": 0.0, "kl": 0.1823974996805191, "learning_rate": 1.5836790454107007e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11220 }, { "completion_length": 539.75, "epoch": 3.110033259423503, "grad_norm": 0.0, "kl": 0.21722616255283356, "learning_rate": 1.5832717351329086e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11221 }, { "completion_length": 656.25, "epoch": 3.110310421286031, "grad_norm": 0.0, "kl": 0.15949048101902008, "learning_rate": 1.5828644529673592e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11222 }, { "completion_length": 521.0, "epoch": 3.1105875831485585, "grad_norm": 0.0, "kl": 0.17841887474060059, "learning_rate": 1.5824571989265414e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11223 }, { "completion_length": 666.0, "epoch": 3.1108647450110865, "grad_norm": 0.0, "kl": 0.15104128420352936, "learning_rate": 1.582049973022945e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11224 }, { "completion_length": 637.25, "epoch": 3.1111419068736144, "grad_norm": 0.0, "kl": 10.453760147094727, "learning_rate": 1.5816427752690567e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11225 }, { "completion_length": 634.0, "epoch": 3.111419068736142, "grad_norm": 0.0, "kl": 0.15508046746253967, "learning_rate": 1.5812356056773647e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11226 }, { "completion_length": 593.25, "epoch": 3.1116962305986697, "grad_norm": 0.0, "kl": 0.17042316496372223, "learning_rate": 1.5808284642603545e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11227 }, { "completion_length": 585.25, "epoch": 3.111973392461197, "grad_norm": 0.0, "kl": 0.260222852230072, "learning_rate": 1.5804213510305105e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11228 }, { "completion_length": 642.25, "epoch": 3.112250554323725, "grad_norm": 0.0, "kl": 0.1813891977071762, "learning_rate": 1.5800142660003197e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11229 }, { "completion_length": 562.0, "epoch": 3.1125277161862526, "grad_norm": 0.0, "kl": 0.2066383957862854, "learning_rate": 1.579607209182264e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11230 }, { "completion_length": 573.25, "epoch": 3.1128048780487805, "grad_norm": 0.0, "kl": 0.2811133563518524, "learning_rate": 1.5792001805888269e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11231 }, { "completion_length": 562.0, "epoch": 3.1130820399113084, "grad_norm": 0.0, "kl": 0.236404150724411, "learning_rate": 1.578793180232489e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11232 }, { "completion_length": 566.75, "epoch": 3.113359201773836, "grad_norm": 0.0, "kl": 0.1923331320285797, "learning_rate": 1.578386208125733e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11233 }, { "completion_length": 620.75, "epoch": 3.1136363636363638, "grad_norm": 0.0, "kl": 1509691520.0, "learning_rate": 1.5779792642810376e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11234 }, { "completion_length": 566.0, "epoch": 3.113913525498891, "grad_norm": 0.0, "kl": 11642813440.0, "learning_rate": 1.5775723487108821e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11235 }, { "completion_length": 551.0, "epoch": 3.114190687361419, "grad_norm": 0.46140342950820923, "kl": 1.9738836468760576e+16, "learning_rate": 1.577165461427747e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11236 }, { "completion_length": 677.5, "epoch": 3.1144678492239466, "grad_norm": 0.0, "kl": 0.1746644377708435, "learning_rate": 1.5767586024441066e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11237 }, { "completion_length": 639.75, "epoch": 3.1147450110864745, "grad_norm": 0.0, "kl": 0.17607015371322632, "learning_rate": 1.576351771772441e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11238 }, { "completion_length": 563.0, "epoch": 3.1150221729490024, "grad_norm": 0.0, "kl": 0.1645885556936264, "learning_rate": 1.5759449694252226e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11239 }, { "completion_length": 666.75, "epoch": 3.11529933481153, "grad_norm": 0.0, "kl": 0.1455984264612198, "learning_rate": 1.5755381954149296e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11240 }, { "completion_length": 565.75, "epoch": 3.115576496674058, "grad_norm": 0.0, "kl": 0.20236623287200928, "learning_rate": 1.5751314497540332e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11241 }, { "completion_length": 533.5, "epoch": 3.1158536585365852, "grad_norm": 0.0, "kl": 0.23474504053592682, "learning_rate": 1.5747247324550084e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11242 }, { "completion_length": 654.75, "epoch": 3.116130820399113, "grad_norm": 0.0, "kl": 0.2316112220287323, "learning_rate": 1.5743180435303274e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11243 }, { "completion_length": 658.75, "epoch": 3.1164079822616406, "grad_norm": 0.40232881903648376, "kl": 79735742464.0, "learning_rate": 1.5739113829924605e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11244 }, { "completion_length": 604.75, "epoch": 3.1166851441241685, "grad_norm": 0.0, "kl": 0.15449050068855286, "learning_rate": 1.5735047508538799e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11245 }, { "completion_length": 585.75, "epoch": 3.1169623059866964, "grad_norm": 0.0, "kl": 0.20709556341171265, "learning_rate": 1.5730981471270534e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11246 }, { "completion_length": 607.75, "epoch": 3.117239467849224, "grad_norm": 1.3074735403060913, "kl": 9341207552.0, "learning_rate": 1.5726915718244517e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11247 }, { "completion_length": 633.75, "epoch": 3.117516629711752, "grad_norm": 3.542144298553467, "kl": 7632863100928.0, "learning_rate": 1.5722850249585415e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11248 }, { "completion_length": 551.25, "epoch": 3.1177937915742793, "grad_norm": 0.0, "kl": 0.39151516556739807, "learning_rate": 1.5718785065417901e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11249 }, { "completion_length": 661.5, "epoch": 3.118070953436807, "grad_norm": 0.0, "kl": 0.14471875131130219, "learning_rate": 1.571472016586665e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11250 }, { "completion_length": 561.25, "epoch": 3.1183481152993346, "grad_norm": 0.0, "kl": 0.19390612840652466, "learning_rate": 1.57106555510563e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11251 }, { "completion_length": 616.25, "epoch": 3.1186252771618626, "grad_norm": 0.0, "kl": 0.15688456594944, "learning_rate": 1.5706591221111506e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11252 }, { "completion_length": 536.5, "epoch": 3.1189024390243905, "grad_norm": 0.5508245229721069, "kl": 1.3198410878287872e+16, "learning_rate": 1.5702527176156895e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11253 }, { "completion_length": 663.75, "epoch": 3.119179600886918, "grad_norm": 0.0, "kl": 0.1759069561958313, "learning_rate": 1.5698463416317107e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11254 }, { "completion_length": 653.75, "epoch": 3.119456762749446, "grad_norm": 0.0, "kl": 0.7290492653846741, "learning_rate": 1.5694399941716743e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11255 }, { "completion_length": 594.75, "epoch": 3.1197339246119733, "grad_norm": 0.0, "kl": 0.1705370396375656, "learning_rate": 1.5690336752480428e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11256 }, { "completion_length": 499.25, "epoch": 3.120011086474501, "grad_norm": 0.0, "kl": 0.1795557141304016, "learning_rate": 1.5686273848732752e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11257 }, { "completion_length": 568.5, "epoch": 3.1202882483370287, "grad_norm": 0.0, "kl": 0.18182696402072906, "learning_rate": 1.568221123059831e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11258 }, { "completion_length": 592.5, "epoch": 3.1205654101995566, "grad_norm": 0.4287635087966919, "kl": 2.2089485813756723e+17, "learning_rate": 1.56781488982017e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11259 }, { "completion_length": 565.0, "epoch": 3.1208425720620845, "grad_norm": 0.37183451652526855, "kl": 0.18421544134616852, "learning_rate": 1.5674086851667474e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11260 }, { "completion_length": 553.0, "epoch": 3.121119733924612, "grad_norm": 0.0, "kl": 0.758518397808075, "learning_rate": 1.5670025091120219e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11261 }, { "completion_length": 626.5, "epoch": 3.12139689578714, "grad_norm": 0.0, "kl": 0.18914484977722168, "learning_rate": 1.5665963616684477e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11262 }, { "completion_length": 598.5, "epoch": 3.1216740576496673, "grad_norm": 0.0, "kl": 0.18206089735031128, "learning_rate": 1.566190242848481e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11263 }, { "completion_length": 541.75, "epoch": 3.1219512195121952, "grad_norm": 0.0, "kl": 0.1850106567144394, "learning_rate": 1.565784152664574e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11264 }, { "completion_length": 616.25, "epoch": 3.1222283813747227, "grad_norm": 0.0, "kl": 0.17788006365299225, "learning_rate": 1.5653780911291811e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11265 }, { "completion_length": 602.0, "epoch": 3.1225055432372506, "grad_norm": 0.0, "kl": 0.4339056611061096, "learning_rate": 1.5649720582547546e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11266 }, { "completion_length": 643.25, "epoch": 3.122782705099778, "grad_norm": 0.40840038657188416, "kl": 1.1959994424701747e+17, "learning_rate": 1.5645660540537444e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11267 }, { "completion_length": 557.5, "epoch": 3.123059866962306, "grad_norm": 0.0, "kl": 0.17845988273620605, "learning_rate": 1.5641600785386037e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11268 }, { "completion_length": 470.75, "epoch": 3.123337028824834, "grad_norm": 0.0, "kl": 0.3229719400405884, "learning_rate": 1.563754131721779e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11269 }, { "completion_length": 561.75, "epoch": 3.1236141906873613, "grad_norm": 0.0, "kl": 0.18315652012825012, "learning_rate": 1.5633482136157216e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11270 }, { "completion_length": 550.25, "epoch": 3.1238913525498893, "grad_norm": 0.0, "kl": 0.1929135024547577, "learning_rate": 1.5629423242328768e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11271 }, { "completion_length": 633.25, "epoch": 3.1241685144124167, "grad_norm": 0.0, "kl": 0.4694008529186249, "learning_rate": 1.5625364635856933e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11272 }, { "completion_length": 534.75, "epoch": 3.1244456762749446, "grad_norm": 0.0, "kl": 0.17928588390350342, "learning_rate": 1.5621306316866169e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11273 }, { "completion_length": 644.5, "epoch": 3.124722838137472, "grad_norm": 0.0, "kl": 0.16263729333877563, "learning_rate": 1.5617248285480924e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11274 }, { "completion_length": 566.75, "epoch": 3.125, "grad_norm": 0.0, "kl": 0.1644495278596878, "learning_rate": 1.5613190541825649e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11275 }, { "completion_length": 556.25, "epoch": 3.125277161862528, "grad_norm": 0.0, "kl": 1426005109506048.0, "learning_rate": 1.5609133086024763e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11276 }, { "completion_length": 535.75, "epoch": 3.1255543237250554, "grad_norm": 0.0, "kl": 0.16027209162712097, "learning_rate": 1.5605075918202712e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11277 }, { "completion_length": 530.5, "epoch": 3.1258314855875833, "grad_norm": 0.0, "kl": 0.19567424058914185, "learning_rate": 1.560101903848389e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11278 }, { "completion_length": 588.25, "epoch": 3.1261086474501107, "grad_norm": 0.0, "kl": 0.18485310673713684, "learning_rate": 1.559696244699271e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11279 }, { "completion_length": 453.5, "epoch": 3.1263858093126387, "grad_norm": 0.0, "kl": 0.28004202246665955, "learning_rate": 1.559290614385359e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11280 }, { "completion_length": 547.5, "epoch": 3.126662971175166, "grad_norm": 0.0, "kl": 0.2040269523859024, "learning_rate": 1.5588850129190897e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11281 }, { "completion_length": 610.75, "epoch": 3.126940133037694, "grad_norm": 0.0, "kl": 2970473201664000.0, "learning_rate": 1.5584794403129028e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11282 }, { "completion_length": 579.25, "epoch": 3.127217294900222, "grad_norm": 0.0, "kl": 0.17436054348945618, "learning_rate": 1.5580738965792345e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11283 }, { "completion_length": 545.0, "epoch": 3.1274944567627494, "grad_norm": 0.0, "kl": 0.24483974277973175, "learning_rate": 1.5576683817305216e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11284 }, { "completion_length": 589.5, "epoch": 3.1277716186252773, "grad_norm": 0.0, "kl": 0.1924356073141098, "learning_rate": 1.557262895779199e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11285 }, { "completion_length": 610.75, "epoch": 3.1280487804878048, "grad_norm": 0.0, "kl": 0.17922383546829224, "learning_rate": 1.5568574387377016e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11286 }, { "completion_length": 493.25, "epoch": 3.1283259423503327, "grad_norm": 0.0, "kl": 0.19469614326953888, "learning_rate": 1.5564520106184643e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11287 }, { "completion_length": 580.25, "epoch": 3.12860310421286, "grad_norm": 0.0, "kl": 0.19839969277381897, "learning_rate": 1.5560466114339174e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11288 }, { "completion_length": 604.5, "epoch": 3.128880266075388, "grad_norm": 0.0, "kl": 356407904.0, "learning_rate": 1.5556412411964957e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11289 }, { "completion_length": 679.75, "epoch": 3.1291574279379155, "grad_norm": 0.0, "kl": 0.15108174085617065, "learning_rate": 1.5552358999186274e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11290 }, { "completion_length": 681.5, "epoch": 3.1294345898004434, "grad_norm": 0.0, "kl": 0.29371383786201477, "learning_rate": 1.5548305876127445e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11291 }, { "completion_length": 513.25, "epoch": 3.1297117516629713, "grad_norm": 0.4216030240058899, "kl": 0.22829186916351318, "learning_rate": 1.5544253042912756e-06, "loss": 0.0, "reward": 5.0, "reward_std": 1.5, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11292 }, { "completion_length": 510.5, "epoch": 3.129988913525499, "grad_norm": 0.0, "kl": 0.18644922971725464, "learning_rate": 1.554020049966649e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11293 }, { "completion_length": 612.5, "epoch": 3.1302660753880267, "grad_norm": 0.0, "kl": 0.20469091832637787, "learning_rate": 1.5536148246512921e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11294 }, { "completion_length": 588.0, "epoch": 3.130543237250554, "grad_norm": 0.0, "kl": 0.17664039134979248, "learning_rate": 1.553209628357632e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11295 }, { "completion_length": 524.75, "epoch": 3.130820399113082, "grad_norm": 0.5088614225387573, "kl": 1210926837530624.0, "learning_rate": 1.5528044610980942e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11296 }, { "completion_length": 772.0, "epoch": 3.1310975609756095, "grad_norm": 0.0, "kl": 0.11591028422117233, "learning_rate": 1.552399322885103e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11297 }, { "completion_length": 687.0, "epoch": 3.1313747228381374, "grad_norm": 0.0, "kl": 0.16772150993347168, "learning_rate": 1.5519942137310828e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11298 }, { "completion_length": 675.75, "epoch": 3.1316518847006654, "grad_norm": 0.3673241436481476, "kl": 0.19536784291267395, "learning_rate": 1.551589133648456e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11299 }, { "completion_length": 659.25, "epoch": 3.131929046563193, "grad_norm": 0.450461745262146, "kl": 4.940111756551782e+16, "learning_rate": 1.5511840826496462e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11300 }, { "completion_length": 585.5, "epoch": 3.1322062084257207, "grad_norm": 0.3702390193939209, "kl": 0.6777312755584717, "learning_rate": 1.5507790607470724e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11301 }, { "completion_length": 585.0, "epoch": 3.132483370288248, "grad_norm": 0.0, "kl": 0.16447722911834717, "learning_rate": 1.5503740679531565e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11302 }, { "completion_length": 451.5, "epoch": 3.132760532150776, "grad_norm": 0.0, "kl": 0.18841153383255005, "learning_rate": 1.5499691042803183e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11303 }, { "completion_length": 513.0, "epoch": 3.1330376940133036, "grad_norm": 0.0, "kl": 0.19946755468845367, "learning_rate": 1.549564169740975e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11304 }, { "completion_length": 607.5, "epoch": 3.1333148558758315, "grad_norm": 0.45851588249206543, "kl": 0.2628057301044464, "learning_rate": 1.5491592643475454e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11305 }, { "completion_length": 556.75, "epoch": 3.1335920177383594, "grad_norm": 0.0, "kl": 0.18137039244174957, "learning_rate": 1.5487543881124451e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11306 }, { "completion_length": 606.75, "epoch": 3.133869179600887, "grad_norm": 0.41898855566978455, "kl": 0.16596832871437073, "learning_rate": 1.5483495410480915e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11307 }, { "completion_length": 566.75, "epoch": 3.1341463414634148, "grad_norm": 0.0, "kl": 0.21588315069675446, "learning_rate": 1.5479447231668978e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11308 }, { "completion_length": 592.0, "epoch": 3.134423503325942, "grad_norm": 0.0, "kl": 0.2053963840007782, "learning_rate": 1.5475399344812786e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11309 }, { "completion_length": 582.25, "epoch": 3.13470066518847, "grad_norm": 0.0, "kl": 0.9171933531761169, "learning_rate": 1.5471351750036486e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11310 }, { "completion_length": 543.5, "epoch": 3.1349778270509976, "grad_norm": 0.0, "kl": 0.22660161554813385, "learning_rate": 1.5467304447464188e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11311 }, { "completion_length": 698.5, "epoch": 3.1352549889135255, "grad_norm": 0.0, "kl": 0.17584222555160522, "learning_rate": 1.5463257437220007e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11312 }, { "completion_length": 580.75, "epoch": 3.1355321507760534, "grad_norm": 0.0, "kl": 0.21252572536468506, "learning_rate": 1.5459210719428044e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11313 }, { "completion_length": 499.5, "epoch": 3.135809312638581, "grad_norm": 0.0, "kl": 0.17347005009651184, "learning_rate": 1.5455164294212406e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11314 }, { "completion_length": 527.75, "epoch": 3.136086474501109, "grad_norm": 0.0, "kl": 0.42823758721351624, "learning_rate": 1.5451118161697172e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11315 }, { "completion_length": 575.5, "epoch": 3.1363636363636362, "grad_norm": 0.5152109861373901, "kl": 0.18882635235786438, "learning_rate": 1.5447072322006415e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11316 }, { "completion_length": 580.5, "epoch": 3.136640798226164, "grad_norm": 0.0, "kl": 0.23122040927410126, "learning_rate": 1.544302677526422e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11317 }, { "completion_length": 477.5, "epoch": 3.1369179600886916, "grad_norm": 0.0, "kl": 0.18215009570121765, "learning_rate": 1.5438981521594627e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11318 }, { "completion_length": 634.25, "epoch": 3.1371951219512195, "grad_norm": 0.0, "kl": 0.19023407995700836, "learning_rate": 1.543493656112171e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11319 }, { "completion_length": 686.25, "epoch": 3.1374722838137474, "grad_norm": 0.0, "kl": 118653689266176.0, "learning_rate": 1.5430891893969488e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11320 }, { "completion_length": 455.75, "epoch": 3.137749445676275, "grad_norm": 0.44150152802467346, "kl": 480887603200.0, "learning_rate": 1.5426847520262017e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11321 }, { "completion_length": 566.0, "epoch": 3.138026607538803, "grad_norm": 0.0, "kl": 0.18653646111488342, "learning_rate": 1.5422803440123297e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11322 }, { "completion_length": 510.5, "epoch": 3.1383037694013303, "grad_norm": 2.631450653076172, "kl": 2819667.25, "learning_rate": 1.5418759653677361e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11323 }, { "completion_length": 612.0, "epoch": 3.138580931263858, "grad_norm": 0.0, "kl": 35.959381103515625, "learning_rate": 1.5414716161048209e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11324 }, { "completion_length": 627.75, "epoch": 3.1388580931263856, "grad_norm": 0.0, "kl": 0.1683906614780426, "learning_rate": 1.5410672962359841e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11325 }, { "completion_length": 605.25, "epoch": 3.1391352549889135, "grad_norm": 9.791945457458496, "kl": 0.3510739505290985, "learning_rate": 1.540663005773624e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11326 }, { "completion_length": 568.25, "epoch": 3.1394124168514415, "grad_norm": 0.0, "kl": 21046806528.0, "learning_rate": 1.5402587447301387e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11327 }, { "completion_length": 589.25, "epoch": 3.139689578713969, "grad_norm": 0.0, "kl": 0.2010968029499054, "learning_rate": 1.539854513117926e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11328 }, { "completion_length": 549.0, "epoch": 3.139966740576497, "grad_norm": 0.0, "kl": 0.23477980494499207, "learning_rate": 1.5394503109493801e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11329 }, { "completion_length": 616.5, "epoch": 3.1402439024390243, "grad_norm": 0.0, "kl": 0.1595965474843979, "learning_rate": 1.5390461382368985e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11330 }, { "completion_length": 558.75, "epoch": 3.140521064301552, "grad_norm": 0.0, "kl": 0.19147469103336334, "learning_rate": 1.5386419949928732e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11331 }, { "completion_length": 532.25, "epoch": 3.1407982261640797, "grad_norm": 0.0, "kl": 0.290067583322525, "learning_rate": 1.538237881229699e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11332 }, { "completion_length": 692.75, "epoch": 3.1410753880266076, "grad_norm": 0.0, "kl": 0.15277989208698273, "learning_rate": 1.537833796959769e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11333 }, { "completion_length": 578.75, "epoch": 3.1413525498891355, "grad_norm": 0.0, "kl": 0.2244119793176651, "learning_rate": 1.5374297421954732e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11334 }, { "completion_length": 453.75, "epoch": 3.141629711751663, "grad_norm": 0.0, "kl": 0.20163223147392273, "learning_rate": 1.5370257169492037e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11335 }, { "completion_length": 465.5, "epoch": 3.141906873614191, "grad_norm": 0.0, "kl": 0.22225593030452728, "learning_rate": 1.536621721233349e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11336 }, { "completion_length": 591.0, "epoch": 3.1421840354767183, "grad_norm": 0.0, "kl": 0.18909570574760437, "learning_rate": 1.5362177550602991e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11337 }, { "completion_length": 523.75, "epoch": 3.1424611973392462, "grad_norm": 0.0, "kl": 0.18946458399295807, "learning_rate": 1.535813818442441e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11338 }, { "completion_length": 640.75, "epoch": 3.1427383592017737, "grad_norm": 0.0, "kl": 0.17514966428279877, "learning_rate": 1.5354099113921614e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11339 }, { "completion_length": 661.5, "epoch": 3.1430155210643016, "grad_norm": 0.39105474948883057, "kl": 8.864648431193293e+16, "learning_rate": 1.5350060339218487e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11340 }, { "completion_length": 588.5, "epoch": 3.143292682926829, "grad_norm": 0.0, "kl": 0.22553718090057373, "learning_rate": 1.5346021860438854e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11341 }, { "completion_length": 592.25, "epoch": 3.143569844789357, "grad_norm": 0.0, "kl": 0.36972999572753906, "learning_rate": 1.5341983677706579e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11342 }, { "completion_length": 538.0, "epoch": 3.143847006651885, "grad_norm": 0.0, "kl": 0.22079448401927948, "learning_rate": 1.5337945791145487e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11343 }, { "completion_length": 627.5, "epoch": 3.1441241685144123, "grad_norm": 0.0, "kl": 0.18593734502792358, "learning_rate": 1.5333908200879404e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11344 }, { "completion_length": 505.25, "epoch": 3.1444013303769403, "grad_norm": 0.0, "kl": 0.2099422663450241, "learning_rate": 1.5329870907032145e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11345 }, { "completion_length": 547.0, "epoch": 3.1446784922394677, "grad_norm": 5.412545204162598, "kl": 1.0264879495643136e+16, "learning_rate": 1.532583390972752e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11346 }, { "completion_length": 612.75, "epoch": 3.1449556541019956, "grad_norm": 0.45310521125793457, "kl": 1.020102620124817, "learning_rate": 1.5321797209089329e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11347 }, { "completion_length": 484.25, "epoch": 3.145232815964523, "grad_norm": 0.5443623065948486, "kl": 4.534681876194918e+16, "learning_rate": 1.531776080524135e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11348 }, { "completion_length": 531.75, "epoch": 3.145509977827051, "grad_norm": 0.0, "kl": 0.17585578560829163, "learning_rate": 1.5313724698307378e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11349 }, { "completion_length": 571.0, "epoch": 3.145787139689579, "grad_norm": 3.391674757003784, "kl": 185898381606912.0, "learning_rate": 1.5309688888411165e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11350 }, { "completion_length": 669.0, "epoch": 3.1460643015521064, "grad_norm": 0.0, "kl": 2.0684536727732224e+16, "learning_rate": 1.5305653375676497e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11351 }, { "completion_length": 554.25, "epoch": 3.1463414634146343, "grad_norm": 0.0, "kl": 0.28056931495666504, "learning_rate": 1.5301618160227098e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11352 }, { "completion_length": 608.25, "epoch": 3.1466186252771617, "grad_norm": 0.0, "kl": 0.4096727669239044, "learning_rate": 1.5297583242186742e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11353 }, { "completion_length": 525.25, "epoch": 3.1468957871396896, "grad_norm": 0.0, "kl": 0.20138685405254364, "learning_rate": 1.5293548621679127e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11354 }, { "completion_length": 600.75, "epoch": 3.147172949002217, "grad_norm": 0.41848450899124146, "kl": 1.126082442952704e+17, "learning_rate": 1.5289514298828006e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11355 }, { "completion_length": 571.5, "epoch": 3.147450110864745, "grad_norm": 0.46388810873031616, "kl": 3.290736296460288e+16, "learning_rate": 1.5285480273757094e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11356 }, { "completion_length": 544.5, "epoch": 3.147727272727273, "grad_norm": 0.0, "kl": 0.251577228307724, "learning_rate": 1.5281446546590084e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11357 }, { "completion_length": 565.0, "epoch": 3.1480044345898004, "grad_norm": 0.0, "kl": 0.21133951842784882, "learning_rate": 1.5277413117450684e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11358 }, { "completion_length": 487.5, "epoch": 3.1482815964523283, "grad_norm": 0.0, "kl": 0.21514776349067688, "learning_rate": 1.5273379986462577e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11359 }, { "completion_length": 612.0, "epoch": 3.1485587583148558, "grad_norm": 0.0, "kl": 0.17908862233161926, "learning_rate": 1.5269347153749448e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11360 }, { "completion_length": 545.5, "epoch": 3.1488359201773837, "grad_norm": 0.0, "kl": 0.21320341527462006, "learning_rate": 1.5265314619434952e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11361 }, { "completion_length": 665.75, "epoch": 3.149113082039911, "grad_norm": 0.0, "kl": 508589888.0, "learning_rate": 1.5261282383642767e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11362 }, { "completion_length": 537.5, "epoch": 3.149390243902439, "grad_norm": 0.0, "kl": 0.4573397934436798, "learning_rate": 1.525725044649654e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11363 }, { "completion_length": 673.5, "epoch": 3.1496674057649665, "grad_norm": 0.0, "kl": 0.404054194688797, "learning_rate": 1.5253218808119912e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11364 }, { "completion_length": 553.75, "epoch": 3.1499445676274944, "grad_norm": 12.410557746887207, "kl": 22608.607421875, "learning_rate": 1.5249187468636526e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11365 }, { "completion_length": 631.5, "epoch": 3.1502217294900223, "grad_norm": 6.068153381347656, "kl": 3133830400.0, "learning_rate": 1.5245156428169988e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11366 }, { "completion_length": 639.75, "epoch": 3.15049889135255, "grad_norm": 0.0, "kl": 0.16090959310531616, "learning_rate": 1.524112568684393e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11367 }, { "completion_length": 613.75, "epoch": 3.1507760532150777, "grad_norm": 0.5368179082870483, "kl": 0.16497240960597992, "learning_rate": 1.5237095244781947e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11368 }, { "completion_length": 578.25, "epoch": 3.151053215077605, "grad_norm": 0.0, "kl": 0.20337678492069244, "learning_rate": 1.5233065102107634e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11369 }, { "completion_length": 506.5, "epoch": 3.151330376940133, "grad_norm": 4.616301536560059, "kl": 22978236416.0, "learning_rate": 1.5229035258944598e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11370 }, { "completion_length": 525.75, "epoch": 3.1516075388026605, "grad_norm": 0.0, "kl": 0.18784330785274506, "learning_rate": 1.5225005715416391e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11371 }, { "completion_length": 532.75, "epoch": 3.1518847006651884, "grad_norm": 0.0, "kl": 0.17734090983867645, "learning_rate": 1.5220976471646616e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11372 }, { "completion_length": 545.75, "epoch": 3.1521618625277164, "grad_norm": 0.0, "kl": 0.19890297949314117, "learning_rate": 1.521694752775879e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11373 }, { "completion_length": 512.25, "epoch": 3.152439024390244, "grad_norm": 0.0, "kl": 0.16888998448848724, "learning_rate": 1.5212918883876499e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11374 }, { "completion_length": 574.5, "epoch": 3.1527161862527717, "grad_norm": 0.0, "kl": 0.16725018620491028, "learning_rate": 1.5208890540123267e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11375 }, { "completion_length": 650.0, "epoch": 3.152993348115299, "grad_norm": 0.0, "kl": 0.15874141454696655, "learning_rate": 1.5204862496622636e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11376 }, { "completion_length": 545.5, "epoch": 3.153270509977827, "grad_norm": 6.108431339263916, "kl": 87908376576.0, "learning_rate": 1.520083475349813e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11377 }, { "completion_length": 514.5, "epoch": 3.1535476718403546, "grad_norm": 0.0, "kl": 0.20995327830314636, "learning_rate": 1.519680731087325e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11378 }, { "completion_length": 635.25, "epoch": 3.1538248337028825, "grad_norm": 0.3653087615966797, "kl": 0.18504054844379425, "learning_rate": 1.5192780168871515e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11379 }, { "completion_length": 575.75, "epoch": 3.1541019955654104, "grad_norm": 0.0, "kl": 0.20313052833080292, "learning_rate": 1.5188753327616407e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11380 }, { "completion_length": 519.25, "epoch": 3.154379157427938, "grad_norm": 0.0, "kl": 0.22195476293563843, "learning_rate": 1.5184726787231435e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11381 }, { "completion_length": 623.25, "epoch": 3.1546563192904657, "grad_norm": 0.39549970626831055, "kl": 124908847104.0, "learning_rate": 1.5180700547840043e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11382 }, { "completion_length": 635.0, "epoch": 3.154933481152993, "grad_norm": 0.3719675838947296, "kl": 7.036086720764314e+16, "learning_rate": 1.5176674609565727e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11383 }, { "completion_length": 674.0, "epoch": 3.155210643015521, "grad_norm": 0.0, "kl": 0.2072163075208664, "learning_rate": 1.5172648972531946e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11384 }, { "completion_length": 652.5, "epoch": 3.1554878048780486, "grad_norm": 0.0, "kl": 0.1657588928937912, "learning_rate": 1.516862363686213e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11385 }, { "completion_length": 623.75, "epoch": 3.1557649667405765, "grad_norm": 0.0, "kl": 0.16168446838855743, "learning_rate": 1.5164598602679734e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11386 }, { "completion_length": 617.25, "epoch": 3.1560421286031044, "grad_norm": 0.0, "kl": 0.18959523737430573, "learning_rate": 1.516057387010818e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11387 }, { "completion_length": 558.25, "epoch": 3.156319290465632, "grad_norm": 0.0, "kl": 0.21283167600631714, "learning_rate": 1.5156549439270901e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11388 }, { "completion_length": 668.75, "epoch": 3.1565964523281598, "grad_norm": 0.0, "kl": 0.15606501698493958, "learning_rate": 1.5152525310291299e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11389 }, { "completion_length": 595.25, "epoch": 3.1568736141906872, "grad_norm": 0.0, "kl": 0.17918896675109863, "learning_rate": 1.5148501483292782e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11390 }, { "completion_length": 574.0, "epoch": 3.157150776053215, "grad_norm": 3.0292110443115234, "kl": 69071020032.0, "learning_rate": 1.514447795839874e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11391 }, { "completion_length": 578.25, "epoch": 3.1574279379157426, "grad_norm": 0.0, "kl": 0.22288565337657928, "learning_rate": 1.5140454735732557e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11392 }, { "completion_length": 630.5, "epoch": 3.1577050997782705, "grad_norm": 0.0, "kl": 0.18670101463794708, "learning_rate": 1.5136431815417623e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11393 }, { "completion_length": 647.5, "epoch": 3.1579822616407984, "grad_norm": 0.0, "kl": 0.21126914024353027, "learning_rate": 1.513240919757729e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11394 }, { "completion_length": 593.25, "epoch": 3.158259423503326, "grad_norm": 0.0, "kl": 0.1414036601781845, "learning_rate": 1.5128386882334921e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11395 }, { "completion_length": 580.25, "epoch": 3.158536585365854, "grad_norm": 0.0, "kl": 512280887296.0, "learning_rate": 1.5124364869813856e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11396 }, { "completion_length": 685.5, "epoch": 3.1588137472283813, "grad_norm": 0.0, "kl": 0.2050992101430893, "learning_rate": 1.5120343160137447e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11397 }, { "completion_length": 562.75, "epoch": 3.159090909090909, "grad_norm": 0.0, "kl": 0.2129010707139969, "learning_rate": 1.511632175342901e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11398 }, { "completion_length": 563.75, "epoch": 3.1593680709534366, "grad_norm": 0.0, "kl": 0.17563432455062866, "learning_rate": 1.5112300649811867e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11399 }, { "completion_length": 570.5, "epoch": 3.1596452328159645, "grad_norm": 0.0, "kl": 0.20679281651973724, "learning_rate": 1.5108279849409336e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11400 }, { "completion_length": 557.25, "epoch": 3.1599223946784925, "grad_norm": 0.0, "kl": 0.22175505757331848, "learning_rate": 1.5104259352344705e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11401 }, { "completion_length": 629.5, "epoch": 3.16019955654102, "grad_norm": 0.0, "kl": 5.161930496514458e+16, "learning_rate": 1.5100239158741289e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11402 }, { "completion_length": 610.75, "epoch": 3.160476718403548, "grad_norm": 0.0, "kl": 0.17686018347740173, "learning_rate": 1.509621926872234e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11403 }, { "completion_length": 598.25, "epoch": 3.1607538802660753, "grad_norm": 0.0, "kl": 0.19425159692764282, "learning_rate": 1.5092199682411163e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11404 }, { "completion_length": 547.0, "epoch": 3.161031042128603, "grad_norm": 0.0, "kl": 0.17492417991161346, "learning_rate": 1.5088180399930993e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11405 }, { "completion_length": 610.25, "epoch": 3.1613082039911307, "grad_norm": 0.0, "kl": 0.1604551076889038, "learning_rate": 1.5084161421405102e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11406 }, { "completion_length": 600.5, "epoch": 3.1615853658536586, "grad_norm": 0.0, "kl": 0.15822364389896393, "learning_rate": 1.5080142746956736e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11407 }, { "completion_length": 582.0, "epoch": 3.1618625277161865, "grad_norm": 0.0, "kl": 0.20003370940685272, "learning_rate": 1.5076124376709122e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11408 }, { "completion_length": 543.75, "epoch": 3.162139689578714, "grad_norm": 0.3956129550933838, "kl": 0.19064687192440033, "learning_rate": 1.5072106310785495e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11409 }, { "completion_length": 600.0, "epoch": 3.162416851441242, "grad_norm": 0.0, "kl": 0.24984584748744965, "learning_rate": 1.5068088549309062e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11410 }, { "completion_length": 519.75, "epoch": 3.1626940133037693, "grad_norm": 0.0, "kl": 0.16387131810188293, "learning_rate": 1.5064071092403047e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11411 }, { "completion_length": 617.75, "epoch": 3.162971175166297, "grad_norm": 0.0, "kl": 6.153164710268109e+16, "learning_rate": 1.5060053940190624e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11412 }, { "completion_length": 648.0, "epoch": 3.1632483370288247, "grad_norm": 0.0, "kl": 0.17242328822612762, "learning_rate": 1.5056037092795006e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11413 }, { "completion_length": 597.5, "epoch": 3.1635254988913526, "grad_norm": 0.0, "kl": 0.17131437361240387, "learning_rate": 1.505202055033937e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11414 }, { "completion_length": 569.5, "epoch": 3.16380266075388, "grad_norm": 0.43079042434692383, "kl": 0.17093370854854584, "learning_rate": 1.5048004312946874e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11415 }, { "completion_length": 618.25, "epoch": 3.164079822616408, "grad_norm": 0.0, "kl": 0.1852603405714035, "learning_rate": 1.5043988380740692e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11416 }, { "completion_length": 552.75, "epoch": 3.164356984478936, "grad_norm": 0.0, "kl": 0.21278080344200134, "learning_rate": 1.5039972753843966e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11417 }, { "completion_length": 678.75, "epoch": 3.1646341463414633, "grad_norm": 0.0, "kl": 0.20741645991802216, "learning_rate": 1.5035957432379847e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11418 }, { "completion_length": 629.0, "epoch": 3.1649113082039912, "grad_norm": 0.3827488422393799, "kl": 314559922176.0, "learning_rate": 1.503194241647146e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11419 }, { "completion_length": 545.5, "epoch": 3.1651884700665187, "grad_norm": 1.0281070470809937, "kl": 402237075488768.0, "learning_rate": 1.5027927706241937e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11420 }, { "completion_length": 990.0, "epoch": 3.1654656319290466, "grad_norm": 4.797922611236572, "kl": 2681481984.0, "learning_rate": 1.5023913301814386e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11421 }, { "completion_length": 607.0, "epoch": 3.165742793791574, "grad_norm": 9.504222869873047, "kl": 0.1502462923526764, "learning_rate": 1.5019899203311906e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11422 }, { "completion_length": 571.5, "epoch": 3.166019955654102, "grad_norm": 0.0, "kl": 0.1721271127462387, "learning_rate": 1.5015885410857617e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11423 }, { "completion_length": 578.5, "epoch": 3.16629711751663, "grad_norm": 0.364515483379364, "kl": 8.074227217250386e+17, "learning_rate": 1.5011871924574578e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11424 }, { "completion_length": 612.25, "epoch": 3.1665742793791574, "grad_norm": 0.0, "kl": 0.2702440619468689, "learning_rate": 1.5007858744585884e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11425 }, { "completion_length": 526.75, "epoch": 3.1668514412416853, "grad_norm": 0.0, "kl": 0.21183159947395325, "learning_rate": 1.500384587101459e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11426 }, { "completion_length": 543.25, "epoch": 3.1671286031042127, "grad_norm": 0.0, "kl": 0.1881849467754364, "learning_rate": 1.4999833303983769e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11427 }, { "completion_length": 484.25, "epoch": 3.1674057649667406, "grad_norm": 0.6740252375602722, "kl": 46354305024.0, "learning_rate": 1.4995821043616455e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11428 }, { "completion_length": 634.0, "epoch": 3.167682926829268, "grad_norm": 1.1296534538269043, "kl": 7655494126665728.0, "learning_rate": 1.4991809090035693e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11429 }, { "completion_length": 528.5, "epoch": 3.167960088691796, "grad_norm": 0.4227604866027832, "kl": 2.461097864986624e+17, "learning_rate": 1.4987797443364516e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11430 }, { "completion_length": 919.75, "epoch": 3.168237250554324, "grad_norm": 0.2812713086605072, "kl": 0.1641296148300171, "learning_rate": 1.4983786103725933e-06, "loss": -0.0, "reward": 5.59375, "reward_std": 0.3125, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.59375, "step": 11431 }, { "completion_length": 627.25, "epoch": 3.1685144124168514, "grad_norm": 0.4029838740825653, "kl": 0.15265177190303802, "learning_rate": 1.497977507124298e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11432 }, { "completion_length": 574.75, "epoch": 3.1687915742793793, "grad_norm": 0.0, "kl": 0.22232937812805176, "learning_rate": 1.4975764346038624e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11433 }, { "completion_length": 546.25, "epoch": 3.1690687361419068, "grad_norm": 0.0, "kl": 0.20665201544761658, "learning_rate": 1.4971753928235893e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11434 }, { "completion_length": 490.0, "epoch": 3.1693458980044347, "grad_norm": 0.0, "kl": 0.20438581705093384, "learning_rate": 1.4967743817957738e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11435 }, { "completion_length": 511.25, "epoch": 3.169623059866962, "grad_norm": 0.0, "kl": 0.20383240282535553, "learning_rate": 1.4963734015327153e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11436 }, { "completion_length": 564.25, "epoch": 3.16990022172949, "grad_norm": 0.3526436686515808, "kl": 0.21603043377399445, "learning_rate": 1.49597245204671e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11437 }, { "completion_length": 605.5, "epoch": 3.1701773835920175, "grad_norm": 0.0, "kl": 0.19560354948043823, "learning_rate": 1.4955715333500523e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11438 }, { "completion_length": 573.25, "epoch": 3.1704545454545454, "grad_norm": 0.0, "kl": 0.2716740071773529, "learning_rate": 1.495170645455038e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11439 }, { "completion_length": 634.75, "epoch": 3.1707317073170733, "grad_norm": 0.0, "kl": 0.19651861488819122, "learning_rate": 1.4947697883739593e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11440 }, { "completion_length": 520.75, "epoch": 3.171008869179601, "grad_norm": 0.0, "kl": 0.23376093804836273, "learning_rate": 1.4943689621191105e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11441 }, { "completion_length": 519.25, "epoch": 3.1712860310421287, "grad_norm": 0.0, "kl": 0.17594116926193237, "learning_rate": 1.493968166702781e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11442 }, { "completion_length": 533.0, "epoch": 3.171563192904656, "grad_norm": 0.0, "kl": 0.20380964875221252, "learning_rate": 1.493567402137263e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11443 }, { "completion_length": 572.25, "epoch": 3.171840354767184, "grad_norm": 0.0, "kl": 0.318422794342041, "learning_rate": 1.4931666684348472e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11444 }, { "completion_length": 536.25, "epoch": 3.1721175166297115, "grad_norm": 0.5404022932052612, "kl": 2.1965111463604388e+18, "learning_rate": 1.4927659656078205e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11445 }, { "completion_length": 518.25, "epoch": 3.1723946784922394, "grad_norm": 0.0, "kl": 0.2033778727054596, "learning_rate": 1.4923652936684724e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11446 }, { "completion_length": 984.0, "epoch": 3.1726718403547673, "grad_norm": 0.0, "kl": 0.13716977834701538, "learning_rate": 1.4919646526290884e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11447 }, { "completion_length": 523.5, "epoch": 3.172949002217295, "grad_norm": 0.0, "kl": 0.1605343073606491, "learning_rate": 1.4915640425019556e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11448 }, { "completion_length": 617.25, "epoch": 3.1732261640798227, "grad_norm": 0.0, "kl": 0.1683272272348404, "learning_rate": 1.4911634632993583e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11449 }, { "completion_length": 616.75, "epoch": 3.17350332594235, "grad_norm": 0.0, "kl": 0.29240882396698, "learning_rate": 1.4907629150335817e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11450 }, { "completion_length": 547.75, "epoch": 3.173780487804878, "grad_norm": 0.0, "kl": 0.210469588637352, "learning_rate": 1.4903623977169073e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11451 }, { "completion_length": 548.25, "epoch": 3.1740576496674056, "grad_norm": 0.0, "kl": 0.19892682135105133, "learning_rate": 1.4899619113616176e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11452 }, { "completion_length": 612.5, "epoch": 3.1743348115299335, "grad_norm": 0.0, "kl": 0.16594402492046356, "learning_rate": 1.4895614559799959e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11453 }, { "completion_length": 544.75, "epoch": 3.1746119733924614, "grad_norm": 0.0, "kl": 0.20027486979961395, "learning_rate": 1.48916103158432e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11454 }, { "completion_length": 750.5, "epoch": 3.174889135254989, "grad_norm": 0.0, "kl": 0.16265909373760223, "learning_rate": 1.4887606381868707e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11455 }, { "completion_length": 539.5, "epoch": 3.1751662971175167, "grad_norm": 0.0, "kl": 38161111580672.0, "learning_rate": 1.488360275799926e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11456 }, { "completion_length": 625.75, "epoch": 3.175443458980044, "grad_norm": 0.0, "kl": 0.187949076294899, "learning_rate": 1.4879599444357635e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11457 }, { "completion_length": 501.0, "epoch": 3.175720620842572, "grad_norm": 0.0, "kl": 0.24157288670539856, "learning_rate": 1.4875596441066591e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11458 }, { "completion_length": 560.0, "epoch": 3.1759977827050996, "grad_norm": 0.0, "kl": 0.39195334911346436, "learning_rate": 1.4871593748248887e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11459 }, { "completion_length": 466.5, "epoch": 3.1762749445676275, "grad_norm": 0.0, "kl": 0.20055972039699554, "learning_rate": 1.4867591366027276e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11460 }, { "completion_length": 568.75, "epoch": 3.1765521064301554, "grad_norm": 0.0, "kl": 0.192732572555542, "learning_rate": 1.4863589294524482e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11461 }, { "completion_length": 580.5, "epoch": 3.176829268292683, "grad_norm": 0.0, "kl": 0.18442362546920776, "learning_rate": 1.4859587533863243e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11462 }, { "completion_length": 540.75, "epoch": 3.1771064301552108, "grad_norm": 0.4252954423427582, "kl": 4307241402368.0, "learning_rate": 1.4855586084166263e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11463 }, { "completion_length": 491.75, "epoch": 3.1773835920177382, "grad_norm": 0.0, "kl": 0.21022683382034302, "learning_rate": 1.485158494555627e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11464 }, { "completion_length": 546.25, "epoch": 3.177660753880266, "grad_norm": 0.0, "kl": 0.20320093631744385, "learning_rate": 1.4847584118155938e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11465 }, { "completion_length": 582.75, "epoch": 3.1779379157427936, "grad_norm": 0.0, "kl": 0.18622083961963654, "learning_rate": 1.484358360208797e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11466 }, { "completion_length": 615.0, "epoch": 3.1782150776053215, "grad_norm": 0.0, "kl": 0.1563500165939331, "learning_rate": 1.483958339747505e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11467 }, { "completion_length": 503.75, "epoch": 3.1784922394678494, "grad_norm": 0.0, "kl": 0.24513351917266846, "learning_rate": 1.4835583504439837e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11468 }, { "completion_length": 559.75, "epoch": 3.178769401330377, "grad_norm": 0.37901026010513306, "kl": 7386142081024.0, "learning_rate": 1.4831583923105e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11469 }, { "completion_length": 735.0, "epoch": 3.179046563192905, "grad_norm": 0.0, "kl": 0.1518486738204956, "learning_rate": 1.482758465359318e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11470 }, { "completion_length": 542.25, "epoch": 3.1793237250554323, "grad_norm": 0.0, "kl": 0.17565058171749115, "learning_rate": 1.4823585696027026e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11471 }, { "completion_length": 611.25, "epoch": 3.17960088691796, "grad_norm": 0.36860406398773193, "kl": 281369542656.0, "learning_rate": 1.481958705052916e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11472 }, { "completion_length": 491.75, "epoch": 3.1798780487804876, "grad_norm": 2.717402219772339, "kl": 10448516.0, "learning_rate": 1.4815588717222207e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11473 }, { "completion_length": 668.0, "epoch": 3.1801552106430155, "grad_norm": 6.028847694396973, "kl": 105994108928.0, "learning_rate": 1.4811590696228794e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11474 }, { "completion_length": 640.75, "epoch": 3.1804323725055434, "grad_norm": 0.0, "kl": 0.18341569602489471, "learning_rate": 1.48075929876715e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11475 }, { "completion_length": 603.5, "epoch": 3.180709534368071, "grad_norm": 0.0, "kl": 0.18460744619369507, "learning_rate": 1.4803595591672937e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11476 }, { "completion_length": 576.25, "epoch": 3.180986696230599, "grad_norm": 0.0, "kl": 0.16808506846427917, "learning_rate": 1.4799598508355678e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11477 }, { "completion_length": 465.75, "epoch": 3.1812638580931263, "grad_norm": 0.0, "kl": 0.23698295652866364, "learning_rate": 1.4795601737842302e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11478 }, { "completion_length": 580.0, "epoch": 3.181541019955654, "grad_norm": 0.0, "kl": 0.19381745159626007, "learning_rate": 1.479160528025537e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11479 }, { "completion_length": 690.25, "epoch": 3.1818181818181817, "grad_norm": 0.0, "kl": 0.25103965401649475, "learning_rate": 1.4787609135717436e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11480 }, { "completion_length": 545.0, "epoch": 3.1820953436807096, "grad_norm": 0.0, "kl": 9.596034976055296e+16, "learning_rate": 1.4783613304351052e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11481 }, { "completion_length": 585.75, "epoch": 3.1823725055432375, "grad_norm": 0.0, "kl": 0.20720447599887848, "learning_rate": 1.4779617786278743e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11482 }, { "completion_length": 579.75, "epoch": 3.182649667405765, "grad_norm": 0.0, "kl": 0.17611323297023773, "learning_rate": 1.477562258162305e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11483 }, { "completion_length": 628.5, "epoch": 3.182926829268293, "grad_norm": 0.0, "kl": 0.1969890147447586, "learning_rate": 1.4771627690506468e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11484 }, { "completion_length": 630.5, "epoch": 3.1832039911308203, "grad_norm": 0.0, "kl": 0.1535976231098175, "learning_rate": 1.4767633113051534e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11485 }, { "completion_length": 590.0, "epoch": 3.183481152993348, "grad_norm": 0.0, "kl": 0.16541758179664612, "learning_rate": 1.4763638849380708e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11486 }, { "completion_length": 862.25, "epoch": 3.1837583148558757, "grad_norm": 2.0546891689300537, "kl": 1.7100094461444096e+17, "learning_rate": 1.4759644899616504e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11487 }, { "completion_length": 607.0, "epoch": 3.1840354767184036, "grad_norm": 11.697380065917969, "kl": 18172227584.0, "learning_rate": 1.4755651263881393e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11488 }, { "completion_length": 509.25, "epoch": 3.1843126385809315, "grad_norm": 0.0, "kl": 0.180620476603508, "learning_rate": 1.4751657942297836e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11489 }, { "completion_length": 550.5, "epoch": 3.184589800443459, "grad_norm": 0.0, "kl": 0.16789382696151733, "learning_rate": 1.474766493498831e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11490 }, { "completion_length": 615.0, "epoch": 3.184866962305987, "grad_norm": 0.0, "kl": 0.16501392424106598, "learning_rate": 1.474367224207524e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11491 }, { "completion_length": 544.0, "epoch": 3.1851441241685143, "grad_norm": 0.35548603534698486, "kl": 654875164672.0, "learning_rate": 1.4739679863681086e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11492 }, { "completion_length": 514.25, "epoch": 3.1854212860310422, "grad_norm": 0.0, "kl": 0.20483101904392242, "learning_rate": 1.473568779992826e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11493 }, { "completion_length": 556.25, "epoch": 3.1856984478935697, "grad_norm": 0.0, "kl": 0.2592533528804779, "learning_rate": 1.4731696050939202e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11494 }, { "completion_length": 620.25, "epoch": 3.1859756097560976, "grad_norm": 0.0, "kl": 0.16308626532554626, "learning_rate": 1.4727704616836297e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11495 }, { "completion_length": 529.5, "epoch": 3.186252771618625, "grad_norm": 0.5394864678382874, "kl": 0.28856727480888367, "learning_rate": 1.4723713497741964e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11496 }, { "completion_length": 557.5, "epoch": 3.186529933481153, "grad_norm": 0.0, "kl": 0.18798762559890747, "learning_rate": 1.4719722693778599e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11497 }, { "completion_length": 554.5, "epoch": 3.186807095343681, "grad_norm": 0.0, "kl": 0.17994147539138794, "learning_rate": 1.4715732205068563e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11498 }, { "completion_length": 574.25, "epoch": 3.1870842572062084, "grad_norm": 0.0, "kl": 0.17586538195610046, "learning_rate": 1.4711742031734246e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11499 }, { "completion_length": 600.75, "epoch": 3.1873614190687363, "grad_norm": 0.0, "kl": 0.20815037190914154, "learning_rate": 1.4707752173897999e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11500 }, { "completion_length": 599.75, "epoch": 3.1876385809312637, "grad_norm": 0.0, "kl": 0.15791408717632294, "learning_rate": 1.4703762631682183e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11501 }, { "completion_length": 575.25, "epoch": 3.1879157427937916, "grad_norm": 0.0, "kl": 0.2536623477935791, "learning_rate": 1.469977340520913e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11502 }, { "completion_length": 539.25, "epoch": 3.188192904656319, "grad_norm": 0.0, "kl": 0.20518095791339874, "learning_rate": 1.4695784494601176e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11503 }, { "completion_length": 624.0, "epoch": 3.188470066518847, "grad_norm": 0.0, "kl": 0.2584402561187744, "learning_rate": 1.469179589998066e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11504 }, { "completion_length": 594.5, "epoch": 3.188747228381375, "grad_norm": 0.0, "kl": 0.17874743044376373, "learning_rate": 1.4687807621469868e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11505 }, { "completion_length": 626.5, "epoch": 3.1890243902439024, "grad_norm": 0.0, "kl": 3.5533473902598554e+17, "learning_rate": 1.4683819659191128e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11506 }, { "completion_length": 569.25, "epoch": 3.1893015521064303, "grad_norm": 0.0, "kl": 0.1770009696483612, "learning_rate": 1.4679832013266721e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11507 }, { "completion_length": 460.5, "epoch": 3.1895787139689578, "grad_norm": 0.0, "kl": 0.22484098374843597, "learning_rate": 1.467584468381894e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11508 }, { "completion_length": 507.0, "epoch": 3.1898558758314857, "grad_norm": 0.39610016345977783, "kl": 8.668522872790057e+17, "learning_rate": 1.467185767097005e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11509 }, { "completion_length": 618.0, "epoch": 3.190133037694013, "grad_norm": 7.462552070617676, "kl": 344533664.0, "learning_rate": 1.4667870974842318e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11510 }, { "completion_length": 617.75, "epoch": 3.190410199556541, "grad_norm": 0.0, "kl": 0.15906496345996857, "learning_rate": 1.4663884595558014e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11511 }, { "completion_length": 521.5, "epoch": 3.1906873614190685, "grad_norm": 0.0, "kl": 0.9606918096542358, "learning_rate": 1.4659898533239365e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11512 }, { "completion_length": 618.5, "epoch": 3.1909645232815964, "grad_norm": 0.0, "kl": 0.34637320041656494, "learning_rate": 1.4655912788008617e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11513 }, { "completion_length": 545.0, "epoch": 3.1912416851441243, "grad_norm": 0.0, "kl": 0.17833195626735687, "learning_rate": 1.4651927359987984e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11514 }, { "completion_length": 593.75, "epoch": 3.191518847006652, "grad_norm": 0.4067300260066986, "kl": 723412779008.0, "learning_rate": 1.4647942249299708e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11515 }, { "completion_length": 591.75, "epoch": 3.1917960088691797, "grad_norm": 0.0, "kl": 1.820206642150879, "learning_rate": 1.4643957456065965e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11516 }, { "completion_length": 599.75, "epoch": 3.192073170731707, "grad_norm": 0.0, "kl": 0.1576002687215805, "learning_rate": 1.463997298040898e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11517 }, { "completion_length": 556.0, "epoch": 3.192350332594235, "grad_norm": 0.0, "kl": 0.18647697567939758, "learning_rate": 1.4635988822450913e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11518 }, { "completion_length": 597.75, "epoch": 3.1926274944567625, "grad_norm": 0.0, "kl": 0.18123304843902588, "learning_rate": 1.4632004982313964e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11519 }, { "completion_length": 486.5, "epoch": 3.1929046563192904, "grad_norm": 0.0, "kl": 8.187982559204102, "learning_rate": 1.4628021460120289e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11520 }, { "completion_length": 517.0, "epoch": 3.1931818181818183, "grad_norm": 0.0, "kl": 0.9102304577827454, "learning_rate": 1.462403825599205e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11521 }, { "completion_length": 611.5, "epoch": 3.193458980044346, "grad_norm": 0.0, "kl": 0.16978567838668823, "learning_rate": 1.46200553700514e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11522 }, { "completion_length": 539.0, "epoch": 3.1937361419068737, "grad_norm": 0.0, "kl": 0.21034428477287292, "learning_rate": 1.4616072802420467e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11523 }, { "completion_length": 464.5, "epoch": 3.194013303769401, "grad_norm": 0.0, "kl": 0.1939900666475296, "learning_rate": 1.4612090553221391e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11524 }, { "completion_length": 590.75, "epoch": 3.194290465631929, "grad_norm": 0.0, "kl": 0.24925167858600616, "learning_rate": 1.4608108622576273e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11525 }, { "completion_length": 532.25, "epoch": 3.1945676274944566, "grad_norm": 1.4032683372497559, "kl": 1.479192234300539e+17, "learning_rate": 1.460412701060724e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11526 }, { "completion_length": 664.5, "epoch": 3.1948447893569845, "grad_norm": 0.0, "kl": 0.14827874302864075, "learning_rate": 1.4600145717436393e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11527 }, { "completion_length": 677.0, "epoch": 3.1951219512195124, "grad_norm": 0.0, "kl": 0.14898905158042908, "learning_rate": 1.4596164743185812e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11528 }, { "completion_length": 603.0, "epoch": 3.19539911308204, "grad_norm": 0.0, "kl": 0.19123975932598114, "learning_rate": 1.4592184087977579e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11529 }, { "completion_length": 550.25, "epoch": 3.1956762749445677, "grad_norm": 7.028131484985352, "kl": 25119.021484375, "learning_rate": 1.4588203751933755e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11530 }, { "completion_length": 624.75, "epoch": 3.195953436807095, "grad_norm": 0.0, "kl": 0.4790095090866089, "learning_rate": 1.4584223735176422e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11531 }, { "completion_length": 635.75, "epoch": 3.196230598669623, "grad_norm": 0.4069105386734009, "kl": 88881725440.0, "learning_rate": 1.4580244037827607e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11532 }, { "completion_length": 561.75, "epoch": 3.1965077605321506, "grad_norm": 0.0, "kl": 0.17414066195487976, "learning_rate": 1.4576264660009371e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11533 }, { "completion_length": 536.75, "epoch": 3.1967849223946785, "grad_norm": 0.0, "kl": 0.24892345070838928, "learning_rate": 1.457228560184374e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11534 }, { "completion_length": 665.75, "epoch": 3.1970620842572064, "grad_norm": 0.41890662908554077, "kl": 4.315568364333302e+17, "learning_rate": 1.456830686345272e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11535 }, { "completion_length": 672.5, "epoch": 3.197339246119734, "grad_norm": 0.0, "kl": 0.1575782150030136, "learning_rate": 1.4564328444958343e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11536 }, { "completion_length": 519.75, "epoch": 3.1976164079822618, "grad_norm": 0.0, "kl": 0.33977222442626953, "learning_rate": 1.4560350346482599e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11537 }, { "completion_length": 545.75, "epoch": 3.1978935698447892, "grad_norm": 0.0, "kl": 0.17230933904647827, "learning_rate": 1.4556372568147484e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11538 }, { "completion_length": 550.25, "epoch": 3.198170731707317, "grad_norm": 0.0, "kl": 0.18819329142570496, "learning_rate": 1.4552395110074968e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11539 }, { "completion_length": 488.25, "epoch": 3.1984478935698446, "grad_norm": 0.0, "kl": 0.22654752433300018, "learning_rate": 1.454841797238703e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11540 }, { "completion_length": 592.0, "epoch": 3.1987250554323725, "grad_norm": 0.40246328711509705, "kl": 8.863196732247245e+16, "learning_rate": 1.454444115520565e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11541 }, { "completion_length": 565.5, "epoch": 3.1990022172949004, "grad_norm": 16.28550910949707, "kl": 3.72027325630188, "learning_rate": 1.4540464658652764e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11542 }, { "completion_length": 662.0, "epoch": 3.199279379157428, "grad_norm": 0.0, "kl": 0.1521267145872116, "learning_rate": 1.4536488482850316e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11543 }, { "completion_length": 627.5, "epoch": 3.199556541019956, "grad_norm": 0.0, "kl": 0.23005852103233337, "learning_rate": 1.4532512627920232e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11544 }, { "completion_length": 562.75, "epoch": 3.1998337028824833, "grad_norm": 0.0, "kl": 0.19004955887794495, "learning_rate": 1.4528537093984445e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11545 }, { "completion_length": 601.5, "epoch": 3.200110864745011, "grad_norm": 7.220178604125977, "kl": 0.17809924483299255, "learning_rate": 1.4524561881164873e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11546 }, { "completion_length": 565.25, "epoch": 3.2003880266075386, "grad_norm": 0.7136737108230591, "kl": 2.0983236584884142e+18, "learning_rate": 1.4520586989583406e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11547 }, { "completion_length": 568.5, "epoch": 3.2006651884700665, "grad_norm": 0.0, "kl": 0.21180705726146698, "learning_rate": 1.4516612419361937e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11548 }, { "completion_length": 610.0, "epoch": 3.2009423503325944, "grad_norm": 0.0, "kl": 0.18040065467357635, "learning_rate": 1.451263817062235e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11549 }, { "completion_length": 512.0, "epoch": 3.201219512195122, "grad_norm": 0.0, "kl": 0.21531875431537628, "learning_rate": 1.4508664243486548e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11550 }, { "completion_length": 499.75, "epoch": 3.20149667405765, "grad_norm": 0.0, "kl": 0.17333325743675232, "learning_rate": 1.450469063807635e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11551 }, { "completion_length": 609.5, "epoch": 3.2017738359201773, "grad_norm": 0.0, "kl": 0.18531253933906555, "learning_rate": 1.450071735451364e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11552 }, { "completion_length": 958.75, "epoch": 3.202050997782705, "grad_norm": 0.22959741950035095, "kl": 0.2932845950126648, "learning_rate": 1.4496744392920241e-06, "loss": 0.0, "reward": 4.09375, "reward_std": 3.3125, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.59375, "step": 11553 }, { "completion_length": 523.75, "epoch": 3.2023281596452327, "grad_norm": 0.0, "kl": 2.252732276916504, "learning_rate": 1.449277175341801e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11554 }, { "completion_length": 578.25, "epoch": 3.2026053215077606, "grad_norm": 0.3785002529621124, "kl": 0.15926364064216614, "learning_rate": 1.4488799436128764e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11555 }, { "completion_length": 622.5, "epoch": 3.2028824833702885, "grad_norm": 10.70295524597168, "kl": 344208.28125, "learning_rate": 1.4484827441174302e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11556 }, { "completion_length": 555.25, "epoch": 3.203159645232816, "grad_norm": 0.5568183660507202, "kl": 5.300449376563364e+17, "learning_rate": 1.448085576867645e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11557 }, { "completion_length": 502.75, "epoch": 3.203436807095344, "grad_norm": 0.0, "kl": 0.2340017408132553, "learning_rate": 1.4476884418756993e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11558 }, { "completion_length": 605.25, "epoch": 3.2037139689578713, "grad_norm": 0.0, "kl": 0.16594673693180084, "learning_rate": 1.4472913391537713e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11559 }, { "completion_length": 515.5, "epoch": 3.203991130820399, "grad_norm": 0.0, "kl": 0.2175334393978119, "learning_rate": 1.446894268714038e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11560 }, { "completion_length": 575.75, "epoch": 3.2042682926829267, "grad_norm": 0.0, "kl": 0.22159387171268463, "learning_rate": 1.4464972305686778e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11561 }, { "completion_length": 547.25, "epoch": 3.2045454545454546, "grad_norm": 0.0, "kl": 0.15872542560100555, "learning_rate": 1.446100224729864e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11562 }, { "completion_length": 600.0, "epoch": 3.2048226164079825, "grad_norm": 0.0, "kl": 0.16712257266044617, "learning_rate": 1.445703251209773e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11563 }, { "completion_length": 509.5, "epoch": 3.20509977827051, "grad_norm": 0.48466750979423523, "kl": 9.978863793607803e+17, "learning_rate": 1.4453063100205778e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11564 }, { "completion_length": 684.25, "epoch": 3.205376940133038, "grad_norm": 0.0, "kl": 0.19629421830177307, "learning_rate": 1.4449094011744494e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11565 }, { "completion_length": 554.0, "epoch": 3.2056541019955653, "grad_norm": 0.0, "kl": 0.1841580718755722, "learning_rate": 1.4445125246835617e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11566 }, { "completion_length": 538.5, "epoch": 3.2059312638580932, "grad_norm": 0.0, "kl": 0.20279604196548462, "learning_rate": 1.4441156805600842e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11567 }, { "completion_length": 586.75, "epoch": 3.2062084257206207, "grad_norm": 0.0, "kl": 0.1617913842201233, "learning_rate": 1.4437188688161862e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11568 }, { "completion_length": 512.25, "epoch": 3.2064855875831486, "grad_norm": 0.0, "kl": 0.23240163922309875, "learning_rate": 1.443322089464036e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11569 }, { "completion_length": 563.25, "epoch": 3.206762749445676, "grad_norm": 0.0, "kl": 0.1401042342185974, "learning_rate": 1.4429253425158014e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11570 }, { "completion_length": 566.25, "epoch": 3.207039911308204, "grad_norm": 0.0, "kl": 0.2565285265445709, "learning_rate": 1.4425286279836508e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11571 }, { "completion_length": 639.0, "epoch": 3.207317073170732, "grad_norm": 0.0, "kl": 0.16533921658992767, "learning_rate": 1.4421319458797467e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11572 }, { "completion_length": 586.0, "epoch": 3.2075942350332594, "grad_norm": 0.0, "kl": 0.15782667696475983, "learning_rate": 1.441735296216256e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11573 }, { "completion_length": 551.75, "epoch": 3.2078713968957873, "grad_norm": 0.0, "kl": 0.2088179886341095, "learning_rate": 1.4413386790053404e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11574 }, { "completion_length": 528.75, "epoch": 3.2081485587583147, "grad_norm": 0.0, "kl": 0.185901939868927, "learning_rate": 1.4409420942591648e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11575 }, { "completion_length": 601.0, "epoch": 3.2084257206208426, "grad_norm": 0.0, "kl": 0.1758459061384201, "learning_rate": 1.4405455419898894e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11576 }, { "completion_length": 592.0, "epoch": 3.20870288248337, "grad_norm": 0.0, "kl": 0.2086176872253418, "learning_rate": 1.4401490222096743e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11577 }, { "completion_length": 509.5, "epoch": 3.208980044345898, "grad_norm": 0.0, "kl": 0.19334053993225098, "learning_rate": 1.4397525349306808e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11578 }, { "completion_length": 534.75, "epoch": 3.209257206208426, "grad_norm": 6.2165727615356445, "kl": 9910007889920.0, "learning_rate": 1.439356080165066e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11579 }, { "completion_length": 571.5, "epoch": 3.2095343680709534, "grad_norm": 0.0, "kl": 0.1663769632577896, "learning_rate": 1.4389596579249888e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11580 }, { "completion_length": 555.25, "epoch": 3.2098115299334813, "grad_norm": 2.719473361968994, "kl": 2162778112.0, "learning_rate": 1.4385632682226037e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11581 }, { "completion_length": 678.25, "epoch": 3.2100886917960088, "grad_norm": 1.383569598197937, "kl": 12791476224.0, "learning_rate": 1.4381669110700685e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11582 }, { "completion_length": 471.5, "epoch": 3.2103658536585367, "grad_norm": 0.0, "kl": 0.23642101883888245, "learning_rate": 1.4377705864795361e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11583 }, { "completion_length": 563.0, "epoch": 3.210643015521064, "grad_norm": 0.0, "kl": 0.17739328742027283, "learning_rate": 1.4373742944631622e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11584 }, { "completion_length": 549.25, "epoch": 3.210920177383592, "grad_norm": 0.0, "kl": 0.18253086507320404, "learning_rate": 1.436978035033098e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11585 }, { "completion_length": 525.5, "epoch": 3.21119733924612, "grad_norm": 0.0, "kl": 0.2051496058702469, "learning_rate": 1.4365818082014947e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11586 }, { "completion_length": 899.5, "epoch": 3.2114745011086474, "grad_norm": 0.0, "kl": 0.17686568200588226, "learning_rate": 1.4361856139805047e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11587 }, { "completion_length": 587.0, "epoch": 3.2117516629711753, "grad_norm": 0.0, "kl": 0.45313048362731934, "learning_rate": 1.4357894523822763e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11588 }, { "completion_length": 637.25, "epoch": 3.212028824833703, "grad_norm": 0.0, "kl": 0.1911790370941162, "learning_rate": 1.4353933234189587e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11589 }, { "completion_length": 581.0, "epoch": 3.2123059866962307, "grad_norm": 0.0, "kl": 0.20958372950553894, "learning_rate": 1.434997227102698e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11590 }, { "completion_length": 619.0, "epoch": 3.212583148558758, "grad_norm": 0.39067041873931885, "kl": 0.1628214716911316, "learning_rate": 1.434601163445643e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11591 }, { "completion_length": 509.5, "epoch": 3.212860310421286, "grad_norm": 0.0, "kl": 0.2858481705188751, "learning_rate": 1.4342051324599376e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11592 }, { "completion_length": 584.0, "epoch": 3.2131374722838135, "grad_norm": 0.0, "kl": 0.16570380330085754, "learning_rate": 1.4338091341577282e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11593 }, { "completion_length": 604.0, "epoch": 3.2134146341463414, "grad_norm": 0.0, "kl": 0.20989222824573517, "learning_rate": 1.4334131685511572e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11594 }, { "completion_length": 467.25, "epoch": 3.2136917960088693, "grad_norm": 3.4687705039978027, "kl": 2258469897895936.0, "learning_rate": 1.4330172356523664e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11595 }, { "completion_length": 526.5, "epoch": 3.213968957871397, "grad_norm": 0.5873833298683167, "kl": 35971018752.0, "learning_rate": 1.4326213354734993e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11596 }, { "completion_length": 610.75, "epoch": 3.2142461197339247, "grad_norm": 0.0, "kl": 0.16158458590507507, "learning_rate": 1.4322254680266962e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11597 }, { "completion_length": 533.5, "epoch": 3.214523281596452, "grad_norm": 0.0, "kl": 0.24372799694538116, "learning_rate": 1.4318296333240956e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11598 }, { "completion_length": 578.0, "epoch": 3.21480044345898, "grad_norm": 0.0, "kl": 0.2019689679145813, "learning_rate": 1.431433831377836e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11599 }, { "completion_length": 625.25, "epoch": 3.2150776053215075, "grad_norm": 0.0, "kl": 0.1423245519399643, "learning_rate": 1.4310380622000556e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11600 }, { "completion_length": 512.75, "epoch": 3.2153547671840355, "grad_norm": 0.0, "kl": 0.2255345582962036, "learning_rate": 1.4306423258028928e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11601 }, { "completion_length": 667.0, "epoch": 3.2156319290465634, "grad_norm": 0.0, "kl": 7.771886774019686e+16, "learning_rate": 1.4302466221984802e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11602 }, { "completion_length": 626.5, "epoch": 3.215909090909091, "grad_norm": 0.0, "kl": 0.7814332246780396, "learning_rate": 1.429850951398954e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11603 }, { "completion_length": 568.5, "epoch": 3.2161862527716187, "grad_norm": 0.0, "kl": 0.16918256878852844, "learning_rate": 1.4294553134164463e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11604 }, { "completion_length": 466.0, "epoch": 3.216463414634146, "grad_norm": 10.766766548156738, "kl": 1.1810856839308902e+17, "learning_rate": 1.429059708263092e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11605 }, { "completion_length": 580.75, "epoch": 3.216740576496674, "grad_norm": 3.511364698410034, "kl": 2073224.75, "learning_rate": 1.4286641359510217e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11606 }, { "completion_length": 703.25, "epoch": 3.2170177383592016, "grad_norm": 0.0, "kl": 0.16037367284297943, "learning_rate": 1.4282685964923643e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11607 }, { "completion_length": 589.5, "epoch": 3.2172949002217295, "grad_norm": 0.0, "kl": 0.1621575802564621, "learning_rate": 1.4278730898992521e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11608 }, { "completion_length": 586.5, "epoch": 3.2175720620842574, "grad_norm": 0.0, "kl": 0.18520638346672058, "learning_rate": 1.4274776161838123e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11609 }, { "completion_length": 650.25, "epoch": 3.217849223946785, "grad_norm": 0.0, "kl": 0.15840332210063934, "learning_rate": 1.427082175358172e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11610 }, { "completion_length": 557.25, "epoch": 3.2181263858093128, "grad_norm": 0.0, "kl": 0.16329561173915863, "learning_rate": 1.4266867674344572e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11611 }, { "completion_length": 547.75, "epoch": 3.2184035476718402, "grad_norm": 0.0, "kl": 9314372.0, "learning_rate": 1.4262913924247956e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11612 }, { "completion_length": 538.5, "epoch": 3.218680709534368, "grad_norm": 0.0, "kl": 0.20462186634540558, "learning_rate": 1.4258960503413094e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11613 }, { "completion_length": 623.25, "epoch": 3.2189578713968956, "grad_norm": 0.0, "kl": 0.17849360406398773, "learning_rate": 1.4255007411961252e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11614 }, { "completion_length": 472.75, "epoch": 3.2192350332594235, "grad_norm": 0.0, "kl": 4.868242350800896e+16, "learning_rate": 1.4251054650013618e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11615 }, { "completion_length": 552.5, "epoch": 3.2195121951219514, "grad_norm": 0.0, "kl": 0.15576449036598206, "learning_rate": 1.424710221769142e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11616 }, { "completion_length": 622.5, "epoch": 3.219789356984479, "grad_norm": 0.0, "kl": 0.18207962810993195, "learning_rate": 1.4243150115115876e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11617 }, { "completion_length": 594.5, "epoch": 3.220066518847007, "grad_norm": 0.0, "kl": 0.2034984827041626, "learning_rate": 1.423919834240817e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11618 }, { "completion_length": 546.75, "epoch": 3.2203436807095343, "grad_norm": 0.4172486960887909, "kl": 1.707826852798462, "learning_rate": 1.4235246899689486e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11619 }, { "completion_length": 554.75, "epoch": 3.220620842572062, "grad_norm": 0.0, "kl": 0.18774676322937012, "learning_rate": 1.4231295787080995e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11620 }, { "completion_length": 587.75, "epoch": 3.2208980044345896, "grad_norm": 0.0, "kl": 0.18211296200752258, "learning_rate": 1.4227345004703868e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11621 }, { "completion_length": 578.25, "epoch": 3.2211751662971175, "grad_norm": 0.0, "kl": 0.16084836423397064, "learning_rate": 1.4223394552679263e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11622 }, { "completion_length": 597.25, "epoch": 3.2214523281596454, "grad_norm": 0.0, "kl": 0.1817331314086914, "learning_rate": 1.4219444431128304e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11623 }, { "completion_length": 638.75, "epoch": 3.221729490022173, "grad_norm": 0.0, "kl": 0.20671409368515015, "learning_rate": 1.421549464017215e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11624 }, { "completion_length": 534.0, "epoch": 3.222006651884701, "grad_norm": 0.0, "kl": 0.16430220007896423, "learning_rate": 1.42115451799319e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11625 }, { "completion_length": 540.0, "epoch": 3.2222838137472283, "grad_norm": 0.0, "kl": 0.18678364157676697, "learning_rate": 1.4207596050528694e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11626 }, { "completion_length": 470.0, "epoch": 3.222560975609756, "grad_norm": 0.0, "kl": 0.23705552518367767, "learning_rate": 1.4203647252083619e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11627 }, { "completion_length": 633.25, "epoch": 3.2228381374722836, "grad_norm": 1.1784803867340088, "kl": 204902064.0, "learning_rate": 1.4199698784717774e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11628 }, { "completion_length": 585.5, "epoch": 3.2231152993348116, "grad_norm": 0.0, "kl": 0.18781162798404694, "learning_rate": 1.4195750648552226e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11629 }, { "completion_length": 546.25, "epoch": 3.2233924611973395, "grad_norm": 2.6157758235931396, "kl": 5314583552.0, "learning_rate": 1.4191802843708064e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11630 }, { "completion_length": 534.0, "epoch": 3.223669623059867, "grad_norm": 0.0, "kl": 0.16494587063789368, "learning_rate": 1.4187855370306366e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11631 }, { "completion_length": 548.5, "epoch": 3.223946784922395, "grad_norm": 0.0, "kl": 0.17491182684898376, "learning_rate": 1.4183908228468146e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11632 }, { "completion_length": 621.25, "epoch": 3.2242239467849223, "grad_norm": 0.0, "kl": 1.0420610904693604, "learning_rate": 1.417996141831448e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11633 }, { "completion_length": 513.0, "epoch": 3.22450110864745, "grad_norm": 0.4409677982330322, "kl": 0.19156497716903687, "learning_rate": 1.4176014939966374e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11634 }, { "completion_length": 632.0, "epoch": 3.2247782705099777, "grad_norm": 0.0, "kl": 2353884206989312.0, "learning_rate": 1.4172068793544873e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11635 }, { "completion_length": 593.25, "epoch": 3.2250554323725056, "grad_norm": 0.0, "kl": 0.20227812230587006, "learning_rate": 1.4168122979170978e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11636 }, { "completion_length": 542.75, "epoch": 3.2253325942350335, "grad_norm": 0.0, "kl": 5.383662048641024e+16, "learning_rate": 1.4164177496965686e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11637 }, { "completion_length": 600.25, "epoch": 3.225609756097561, "grad_norm": 0.0, "kl": 0.1720508188009262, "learning_rate": 1.4160232347050002e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11638 }, { "completion_length": 581.75, "epoch": 3.225886917960089, "grad_norm": 0.0, "kl": 13.054169654846191, "learning_rate": 1.4156287529544898e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11639 }, { "completion_length": 522.25, "epoch": 3.2261640798226163, "grad_norm": 0.0, "kl": 0.473572701215744, "learning_rate": 1.4152343044571347e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11640 }, { "completion_length": 500.75, "epoch": 3.2264412416851442, "grad_norm": 0.0, "kl": 0.7334802746772766, "learning_rate": 1.4148398892250303e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11641 }, { "completion_length": 744.0, "epoch": 3.2267184035476717, "grad_norm": 0.0, "kl": 0.16450989246368408, "learning_rate": 1.4144455072702731e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11642 }, { "completion_length": 524.75, "epoch": 3.2269955654101996, "grad_norm": 0.0, "kl": 0.24480606615543365, "learning_rate": 1.4140511586049554e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11643 }, { "completion_length": 656.75, "epoch": 3.227272727272727, "grad_norm": 0.34621673822402954, "kl": 0.16003872454166412, "learning_rate": 1.4136568432411735e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11644 }, { "completion_length": 541.0, "epoch": 3.227549889135255, "grad_norm": 0.0, "kl": 0.21733996272087097, "learning_rate": 1.4132625611910147e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11645 }, { "completion_length": 563.25, "epoch": 3.227827050997783, "grad_norm": 0.0, "kl": 0.18449845910072327, "learning_rate": 1.4128683124665727e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11646 }, { "completion_length": 642.0, "epoch": 3.2281042128603104, "grad_norm": 0.0, "kl": 0.15360820293426514, "learning_rate": 1.412474097079938e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11647 }, { "completion_length": 558.5, "epoch": 3.2283813747228383, "grad_norm": 0.0, "kl": 0.3285714089870453, "learning_rate": 1.412079915043199e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11648 }, { "completion_length": 631.0, "epoch": 3.2286585365853657, "grad_norm": 0.0, "kl": 0.18984690308570862, "learning_rate": 1.4116857663684432e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11649 }, { "completion_length": 672.5, "epoch": 3.2289356984478936, "grad_norm": 0.33373528718948364, "kl": 1.558365867593433e+18, "learning_rate": 1.4112916510677567e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11650 }, { "completion_length": 540.75, "epoch": 3.229212860310421, "grad_norm": 0.0, "kl": 0.18688222765922546, "learning_rate": 1.4108975691532273e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11651 }, { "completion_length": 947.75, "epoch": 3.229490022172949, "grad_norm": 0.0, "kl": 0.136748269200325, "learning_rate": 1.410503520636939e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11652 }, { "completion_length": 560.0, "epoch": 3.229767184035477, "grad_norm": 0.0, "kl": 0.186201810836792, "learning_rate": 1.4101095055309746e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11653 }, { "completion_length": 584.25, "epoch": 3.2300443458980044, "grad_norm": 1.3518187999725342, "kl": 56214280994816.0, "learning_rate": 1.4097155238474187e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11654 }, { "completion_length": 589.0, "epoch": 3.2303215077605323, "grad_norm": 0.0, "kl": 0.15358002483844757, "learning_rate": 1.4093215755983513e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11655 }, { "completion_length": 528.25, "epoch": 3.2305986696230597, "grad_norm": 0.0, "kl": 0.32799407839775085, "learning_rate": 1.4089276607958551e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11656 }, { "completion_length": 564.75, "epoch": 3.2308758314855877, "grad_norm": 0.534728467464447, "kl": 1.2373154816694682e+18, "learning_rate": 1.4085337794520087e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11657 }, { "completion_length": 509.25, "epoch": 3.231152993348115, "grad_norm": 0.45767053961753845, "kl": 0.19859068095684052, "learning_rate": 1.4081399315788907e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11658 }, { "completion_length": 574.25, "epoch": 3.231430155210643, "grad_norm": 0.0, "kl": 0.1693916618824005, "learning_rate": 1.4077461171885781e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11659 }, { "completion_length": 543.75, "epoch": 3.231707317073171, "grad_norm": 0.0, "kl": 0.22988925874233246, "learning_rate": 1.4073523362931496e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11660 }, { "completion_length": 521.25, "epoch": 3.2319844789356984, "grad_norm": 0.0, "kl": 0.2077149748802185, "learning_rate": 1.4069585889046796e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11661 }, { "completion_length": 574.25, "epoch": 3.2322616407982263, "grad_norm": 0.0, "kl": 0.16960902512073517, "learning_rate": 1.4065648750352418e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11662 }, { "completion_length": 593.25, "epoch": 3.2325388026607538, "grad_norm": 0.7736440300941467, "kl": 8.548245509121245e+18, "learning_rate": 1.4061711946969115e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11663 }, { "completion_length": 577.25, "epoch": 3.2328159645232817, "grad_norm": 0.0, "kl": 0.33145642280578613, "learning_rate": 1.4057775479017597e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11664 }, { "completion_length": 595.0, "epoch": 3.233093126385809, "grad_norm": 0.0, "kl": 0.17371729016304016, "learning_rate": 1.4053839346618602e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11665 }, { "completion_length": 603.75, "epoch": 3.233370288248337, "grad_norm": 0.0, "kl": 0.19694383442401886, "learning_rate": 1.4049903549892805e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11666 }, { "completion_length": 475.0, "epoch": 3.2336474501108645, "grad_norm": 0.0, "kl": 4367624184004608.0, "learning_rate": 1.404596808896091e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11667 }, { "completion_length": 546.0, "epoch": 3.2339246119733924, "grad_norm": 0.0, "kl": 0.2515968084335327, "learning_rate": 1.404203296394362e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11668 }, { "completion_length": 587.75, "epoch": 3.2342017738359203, "grad_norm": 0.0, "kl": 0.20628157258033752, "learning_rate": 1.4038098174961592e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11669 }, { "completion_length": 618.0, "epoch": 3.234478935698448, "grad_norm": 0.0, "kl": 0.1536647230386734, "learning_rate": 1.4034163722135494e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11670 }, { "completion_length": 547.0, "epoch": 3.2347560975609757, "grad_norm": 0.0, "kl": 0.18690377473831177, "learning_rate": 1.4030229605585969e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11671 }, { "completion_length": 599.25, "epoch": 3.235033259423503, "grad_norm": 0.754040002822876, "kl": 4.638891097194496e+17, "learning_rate": 1.4026295825433677e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11672 }, { "completion_length": 525.25, "epoch": 3.235310421286031, "grad_norm": 0.0, "kl": 0.1948765069246292, "learning_rate": 1.4022362381799243e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11673 }, { "completion_length": 566.5, "epoch": 3.2355875831485585, "grad_norm": 0.0, "kl": 0.18305781483650208, "learning_rate": 1.401842927480328e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11674 }, { "completion_length": 541.25, "epoch": 3.2358647450110865, "grad_norm": 0.42883017659187317, "kl": 0.23044827580451965, "learning_rate": 1.4014496504566415e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11675 }, { "completion_length": 578.75, "epoch": 3.2361419068736144, "grad_norm": 0.0, "kl": 0.3396073579788208, "learning_rate": 1.4010564071209237e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11676 }, { "completion_length": 555.0, "epoch": 3.236419068736142, "grad_norm": 0.4164113998413086, "kl": 41926017024.0, "learning_rate": 1.4006631974852353e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11677 }, { "completion_length": 578.25, "epoch": 3.2366962305986697, "grad_norm": 0.0, "kl": 0.17458263039588928, "learning_rate": 1.4002700215616338e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11678 }, { "completion_length": 504.25, "epoch": 3.236973392461197, "grad_norm": 0.0, "kl": 0.21734660863876343, "learning_rate": 1.3998768793621756e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11679 }, { "completion_length": 628.5, "epoch": 3.237250554323725, "grad_norm": 0.0, "kl": 0.15688292682170868, "learning_rate": 1.3994837708989162e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11680 }, { "completion_length": 528.5, "epoch": 3.2375277161862526, "grad_norm": 0.0, "kl": 0.24517251551151276, "learning_rate": 1.3990906961839127e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11681 }, { "completion_length": 561.75, "epoch": 3.2378048780487805, "grad_norm": 0.0, "kl": 0.2100280523300171, "learning_rate": 1.3986976552292176e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11682 }, { "completion_length": 656.75, "epoch": 3.2380820399113084, "grad_norm": 7.525580406188965, "kl": 3829449984.0, "learning_rate": 1.3983046480468834e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11683 }, { "completion_length": 602.75, "epoch": 3.238359201773836, "grad_norm": 0.0, "kl": 0.16339237987995148, "learning_rate": 1.3979116746489636e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11684 }, { "completion_length": 705.25, "epoch": 3.2386363636363638, "grad_norm": 0.3539990484714508, "kl": 0.15905404090881348, "learning_rate": 1.3975187350475072e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11685 }, { "completion_length": 564.5, "epoch": 3.238913525498891, "grad_norm": 0.0, "kl": 0.2153209000825882, "learning_rate": 1.397125829254566e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11686 }, { "completion_length": 574.75, "epoch": 3.239190687361419, "grad_norm": 0.0, "kl": 2.3630247606836265e+18, "learning_rate": 1.3967329572821875e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11687 }, { "completion_length": 546.75, "epoch": 3.2394678492239466, "grad_norm": 0.49592843651771545, "kl": 130657779712.0, "learning_rate": 1.3963401191424197e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11688 }, { "completion_length": 659.25, "epoch": 3.2397450110864745, "grad_norm": 0.0, "kl": 0.17321301996707916, "learning_rate": 1.3959473148473088e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11689 }, { "completion_length": 547.25, "epoch": 3.2400221729490024, "grad_norm": 0.3912833631038666, "kl": 47031975936.0, "learning_rate": 1.3955545444089017e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11690 }, { "completion_length": 582.25, "epoch": 3.24029933481153, "grad_norm": 1.762544870376587, "kl": 908269887422464.0, "learning_rate": 1.3951618078392424e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11691 }, { "completion_length": 573.25, "epoch": 3.240576496674058, "grad_norm": 0.0, "kl": 1.132588267326355, "learning_rate": 1.3947691051503734e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11692 }, { "completion_length": 546.5, "epoch": 3.2408536585365852, "grad_norm": 0.0, "kl": 0.18265774846076965, "learning_rate": 1.3943764363543393e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11693 }, { "completion_length": 535.75, "epoch": 3.241130820399113, "grad_norm": 0.3489408791065216, "kl": 0.19804339110851288, "learning_rate": 1.3939838014631795e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11694 }, { "completion_length": 568.0, "epoch": 3.2414079822616406, "grad_norm": 0.0, "kl": 0.19155289232730865, "learning_rate": 1.3935912004889378e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11695 }, { "completion_length": 582.5, "epoch": 3.2416851441241685, "grad_norm": 0.5324374437332153, "kl": 0.3434903025627136, "learning_rate": 1.3931986334436495e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11696 }, { "completion_length": 660.0, "epoch": 3.2419623059866964, "grad_norm": 0.0, "kl": 0.16454704105854034, "learning_rate": 1.392806100339355e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11697 }, { "completion_length": 672.5, "epoch": 3.242239467849224, "grad_norm": 0.39553144574165344, "kl": 9.34879992530495e+18, "learning_rate": 1.3924136011880922e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11698 }, { "completion_length": 540.25, "epoch": 3.242516629711752, "grad_norm": 0.0, "kl": 0.18329483270645142, "learning_rate": 1.3920211360018971e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11699 }, { "completion_length": 543.75, "epoch": 3.2427937915742793, "grad_norm": 0.0, "kl": 0.23657067120075226, "learning_rate": 1.3916287047928045e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11700 }, { "completion_length": 644.5, "epoch": 3.243070953436807, "grad_norm": 0.0, "kl": 0.2234976887702942, "learning_rate": 1.391236307572848e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11701 }, { "completion_length": 568.25, "epoch": 3.2433481152993346, "grad_norm": 0.0, "kl": 0.1661471426486969, "learning_rate": 1.3908439443540626e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11702 }, { "completion_length": 550.75, "epoch": 3.2436252771618626, "grad_norm": 0.0, "kl": 0.2602802813053131, "learning_rate": 1.3904516151484794e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11703 }, { "completion_length": 565.5, "epoch": 3.2439024390243905, "grad_norm": 0.0, "kl": 0.23296630382537842, "learning_rate": 1.3900593199681283e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11704 }, { "completion_length": 525.5, "epoch": 3.244179600886918, "grad_norm": 0.5291976928710938, "kl": 4.252171004912075e+18, "learning_rate": 1.3896670588250418e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11705 }, { "completion_length": 573.5, "epoch": 3.244456762749446, "grad_norm": 0.0, "kl": 1.0249103307724, "learning_rate": 1.3892748317312468e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11706 }, { "completion_length": 550.75, "epoch": 3.2447339246119733, "grad_norm": 0.0, "kl": 0.7771965861320496, "learning_rate": 1.3888826386987732e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11707 }, { "completion_length": 518.0, "epoch": 3.245011086474501, "grad_norm": 0.0, "kl": 0.40175768733024597, "learning_rate": 1.388490479739647e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11708 }, { "completion_length": 547.25, "epoch": 3.2452882483370287, "grad_norm": 0.4521836042404175, "kl": 1.2437608463192818e+19, "learning_rate": 1.3880983548658939e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11709 }, { "completion_length": 652.75, "epoch": 3.2455654101995566, "grad_norm": 0.0, "kl": 2.1844775963258257e+19, "learning_rate": 1.3877062640895379e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11710 }, { "completion_length": 636.0, "epoch": 3.2458425720620845, "grad_norm": 0.0, "kl": 0.20533621311187744, "learning_rate": 1.3873142074226049e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11711 }, { "completion_length": 615.0, "epoch": 3.246119733924612, "grad_norm": 0.0, "kl": 2.6961512202544087e+18, "learning_rate": 1.3869221848771164e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11712 }, { "completion_length": 605.5, "epoch": 3.24639689578714, "grad_norm": 0.0, "kl": 0.1968877613544464, "learning_rate": 1.386530196465093e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11713 }, { "completion_length": 633.75, "epoch": 3.2466740576496673, "grad_norm": 0.0, "kl": 0.1912446767091751, "learning_rate": 1.3861382421985576e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11714 }, { "completion_length": 455.25, "epoch": 3.2469512195121952, "grad_norm": 0.0, "kl": 0.1873457431793213, "learning_rate": 1.3857463220895278e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11715 }, { "completion_length": 590.75, "epoch": 3.2472283813747227, "grad_norm": 0.4236660599708557, "kl": 0.16988347470760345, "learning_rate": 1.3853544361500254e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11716 }, { "completion_length": 574.5, "epoch": 3.2475055432372506, "grad_norm": 0.0, "kl": 0.18209023773670197, "learning_rate": 1.3849625843920633e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11717 }, { "completion_length": 518.0, "epoch": 3.247782705099778, "grad_norm": 0.0, "kl": 0.3064025342464447, "learning_rate": 1.3845707668276614e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11718 }, { "completion_length": 573.5, "epoch": 3.248059866962306, "grad_norm": 0.0, "kl": 0.22507300972938538, "learning_rate": 1.3841789834688333e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11719 }, { "completion_length": 570.0, "epoch": 3.248337028824834, "grad_norm": 0.0, "kl": 0.18829888105392456, "learning_rate": 1.383787234327595e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11720 }, { "completion_length": 517.0, "epoch": 3.2486141906873613, "grad_norm": 0.0, "kl": 0.1938920021057129, "learning_rate": 1.3833955194159585e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11721 }, { "completion_length": 565.75, "epoch": 3.2488913525498893, "grad_norm": 0.0, "kl": 0.19146420061588287, "learning_rate": 1.3830038387459354e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11722 }, { "completion_length": 511.75, "epoch": 3.2491685144124167, "grad_norm": 0.0, "kl": 0.19256237149238586, "learning_rate": 1.3826121923295393e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11723 }, { "completion_length": 683.0, "epoch": 3.2494456762749446, "grad_norm": 0.0, "kl": 0.1666080355644226, "learning_rate": 1.3822205801787792e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11724 }, { "completion_length": 538.0, "epoch": 3.249722838137472, "grad_norm": 3.4188802242279053, "kl": 932381.8125, "learning_rate": 1.3818290023056637e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11725 }, { "completion_length": 491.25, "epoch": 3.25, "grad_norm": 0.0, "kl": 0.21102994680404663, "learning_rate": 1.3814374587222004e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11726 }, { "completion_length": 537.75, "epoch": 3.250277161862528, "grad_norm": 0.0, "kl": 0.20060178637504578, "learning_rate": 1.3810459494403971e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11727 }, { "completion_length": 671.75, "epoch": 3.2505543237250554, "grad_norm": 0.0, "kl": 0.1561209112405777, "learning_rate": 1.3806544744722609e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11728 }, { "completion_length": 550.25, "epoch": 3.2508314855875833, "grad_norm": 0.0, "kl": 0.19999583065509796, "learning_rate": 1.3802630338297956e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11729 }, { "completion_length": 588.0, "epoch": 3.2511086474501107, "grad_norm": 0.0, "kl": 0.4719727039337158, "learning_rate": 1.3798716275250048e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11730 }, { "completion_length": 660.75, "epoch": 3.2513858093126387, "grad_norm": 0.4255709648132324, "kl": 5.301322801112678e+18, "learning_rate": 1.3794802555698907e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11731 }, { "completion_length": 579.5, "epoch": 3.251662971175166, "grad_norm": 0.0, "kl": 0.18770112097263336, "learning_rate": 1.3790889179764572e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11732 }, { "completion_length": 566.0, "epoch": 3.251940133037694, "grad_norm": 0.0, "kl": 0.18763212859630585, "learning_rate": 1.3786976147567033e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11733 }, { "completion_length": 681.25, "epoch": 3.2522172949002215, "grad_norm": 0.0, "kl": 0.17946942150592804, "learning_rate": 1.3783063459226281e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11734 }, { "completion_length": 604.75, "epoch": 3.2524944567627494, "grad_norm": 0.0, "kl": 0.20676499605178833, "learning_rate": 1.3779151114862323e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11735 }, { "completion_length": 604.25, "epoch": 3.2527716186252773, "grad_norm": 0.0, "kl": 0.18993501365184784, "learning_rate": 1.3775239114595113e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11736 }, { "completion_length": 551.5, "epoch": 3.2530487804878048, "grad_norm": 0.0, "kl": 0.20544485747814178, "learning_rate": 1.3771327458544638e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11737 }, { "completion_length": 541.75, "epoch": 3.2533259423503327, "grad_norm": 0.0, "kl": 0.17380988597869873, "learning_rate": 1.3767416146830834e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11738 }, { "completion_length": 529.5, "epoch": 3.25360310421286, "grad_norm": 0.0, "kl": 0.19400152564048767, "learning_rate": 1.3763505179573652e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11739 }, { "completion_length": 489.0, "epoch": 3.253880266075388, "grad_norm": 0.0, "kl": 0.2588781714439392, "learning_rate": 1.3759594556893013e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11740 }, { "completion_length": 595.5, "epoch": 3.2541574279379155, "grad_norm": 0.0, "kl": 0.17394720017910004, "learning_rate": 1.3755684278908865e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11741 }, { "completion_length": 597.5, "epoch": 3.2544345898004434, "grad_norm": 0.0, "kl": 0.1565079540014267, "learning_rate": 1.37517743457411e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11742 }, { "completion_length": 644.5, "epoch": 3.2547117516629713, "grad_norm": 0.0, "kl": 0.1680847704410553, "learning_rate": 1.3747864757509615e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11743 }, { "completion_length": 576.0, "epoch": 3.254988913525499, "grad_norm": 0.0, "kl": 0.18315540254116058, "learning_rate": 1.3743955514334322e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11744 }, { "completion_length": 554.25, "epoch": 3.2552660753880267, "grad_norm": 0.0, "kl": 0.166994109749794, "learning_rate": 1.374004661633508e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11745 }, { "completion_length": 495.25, "epoch": 3.255543237250554, "grad_norm": 0.0, "kl": 19436623872.0, "learning_rate": 1.3736138063631783e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11746 }, { "completion_length": 598.5, "epoch": 3.255820399113082, "grad_norm": 0.0, "kl": 0.18459369242191315, "learning_rate": 1.3732229856344259e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11747 }, { "completion_length": 628.0, "epoch": 3.2560975609756095, "grad_norm": 0.0, "kl": 0.13611723482608795, "learning_rate": 1.372832199459238e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11748 }, { "completion_length": 442.0, "epoch": 3.2563747228381374, "grad_norm": 0.0, "kl": 0.1979006826877594, "learning_rate": 1.372441447849597e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11749 }, { "completion_length": 523.0, "epoch": 3.2566518847006654, "grad_norm": 0.43802016973495483, "kl": 0.192935511469841, "learning_rate": 1.3720507308174873e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11750 }, { "completion_length": 523.75, "epoch": 3.256929046563193, "grad_norm": 0.0, "kl": 0.18036392331123352, "learning_rate": 1.3716600483748892e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11751 }, { "completion_length": 506.25, "epoch": 3.2572062084257207, "grad_norm": 0.0, "kl": 0.2366822212934494, "learning_rate": 1.3712694005337829e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11752 }, { "completion_length": 657.5, "epoch": 3.257483370288248, "grad_norm": 0.0, "kl": 0.16334204375743866, "learning_rate": 1.3708787873061496e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11753 }, { "completion_length": 519.75, "epoch": 3.257760532150776, "grad_norm": 0.0, "kl": 0.15381309390068054, "learning_rate": 1.3704882087039668e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11754 }, { "completion_length": 503.25, "epoch": 3.2580376940133036, "grad_norm": 0.0, "kl": 0.38793861865997314, "learning_rate": 1.370097664739212e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11755 }, { "completion_length": 566.25, "epoch": 3.2583148558758315, "grad_norm": 0.0, "kl": 0.23679105937480927, "learning_rate": 1.3697071554238606e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11756 }, { "completion_length": 543.0, "epoch": 3.2585920177383594, "grad_norm": 0.0, "kl": 0.18844474852085114, "learning_rate": 1.3693166807698887e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11757 }, { "completion_length": 572.0, "epoch": 3.258869179600887, "grad_norm": 0.0, "kl": 0.19134274125099182, "learning_rate": 1.3689262407892717e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11758 }, { "completion_length": 566.0, "epoch": 3.2591463414634148, "grad_norm": 0.0, "kl": 0.27276450395584106, "learning_rate": 1.368535835493982e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11759 }, { "completion_length": 563.5, "epoch": 3.259423503325942, "grad_norm": 0.0, "kl": 0.20102372765541077, "learning_rate": 1.3681454648959913e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11760 }, { "completion_length": 484.25, "epoch": 3.25970066518847, "grad_norm": 0.0, "kl": 0.20571082830429077, "learning_rate": 1.3677551290072697e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11761 }, { "completion_length": 505.5, "epoch": 3.2599778270509976, "grad_norm": 0.0, "kl": 0.34586405754089355, "learning_rate": 1.3673648278397894e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11762 }, { "completion_length": 611.0, "epoch": 3.2602549889135255, "grad_norm": 0.0, "kl": 0.1636510044336319, "learning_rate": 1.366974561405518e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11763 }, { "completion_length": 534.0, "epoch": 3.2605321507760534, "grad_norm": 0.0, "kl": 0.1746130883693695, "learning_rate": 1.3665843297164228e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11764 }, { "completion_length": 556.0, "epoch": 3.260809312638581, "grad_norm": 0.0, "kl": 0.1881967931985855, "learning_rate": 1.366194132784472e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11765 }, { "completion_length": 542.5, "epoch": 3.261086474501109, "grad_norm": 1.1976062059402466, "kl": 2.7947140882681037e+17, "learning_rate": 1.3658039706216298e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11766 }, { "completion_length": 566.75, "epoch": 3.2613636363636362, "grad_norm": 0.0, "kl": 0.2144187092781067, "learning_rate": 1.3654138432398639e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11767 }, { "completion_length": 657.75, "epoch": 3.261640798226164, "grad_norm": 0.0, "kl": 0.1601947844028473, "learning_rate": 1.3650237506511333e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11768 }, { "completion_length": 541.25, "epoch": 3.2619179600886916, "grad_norm": 0.0, "kl": 0.17746403813362122, "learning_rate": 1.3646336928674042e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11769 }, { "completion_length": 489.0, "epoch": 3.2621951219512195, "grad_norm": 0.0, "kl": 0.2055979073047638, "learning_rate": 1.3642436699006356e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11770 }, { "completion_length": 541.25, "epoch": 3.2624722838137474, "grad_norm": 0.0, "kl": 0.2427203208208084, "learning_rate": 1.3638536817627898e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11771 }, { "completion_length": 566.0, "epoch": 3.262749445676275, "grad_norm": 0.0, "kl": 0.19875307381153107, "learning_rate": 1.3634637284658256e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11772 }, { "completion_length": 586.75, "epoch": 3.263026607538803, "grad_norm": 0.0, "kl": 2.7092785835266113, "learning_rate": 1.3630738100216997e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11773 }, { "completion_length": 540.5, "epoch": 3.2633037694013303, "grad_norm": 0.0, "kl": 0.1887994259595871, "learning_rate": 1.3626839264423719e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11774 }, { "completion_length": 595.5, "epoch": 3.263580931263858, "grad_norm": 0.9079665541648865, "kl": 115642631979008.0, "learning_rate": 1.362294077739796e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11775 }, { "completion_length": 575.5, "epoch": 3.2638580931263856, "grad_norm": 0.0, "kl": 0.1915963739156723, "learning_rate": 1.3619042639259293e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11776 }, { "completion_length": 567.75, "epoch": 3.2641352549889135, "grad_norm": 0.0, "kl": 0.19020861387252808, "learning_rate": 1.3615144850127232e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11777 }, { "completion_length": 530.0, "epoch": 3.2644124168514415, "grad_norm": 0.0, "kl": 0.18156376481056213, "learning_rate": 1.3611247410121325e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11778 }, { "completion_length": 605.0, "epoch": 3.264689578713969, "grad_norm": 0.0, "kl": 0.17090857028961182, "learning_rate": 1.3607350319361076e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11779 }, { "completion_length": 603.0, "epoch": 3.264966740576497, "grad_norm": 0.0, "kl": 0.2143360674381256, "learning_rate": 1.3603453577966008e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11780 }, { "completion_length": 522.5, "epoch": 3.2652439024390243, "grad_norm": 0.0, "kl": 0.15933702886104584, "learning_rate": 1.3599557186055612e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11781 }, { "completion_length": 526.25, "epoch": 3.265521064301552, "grad_norm": 0.0, "kl": 0.17868690192699432, "learning_rate": 1.3595661143749364e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11782 }, { "completion_length": 619.25, "epoch": 3.2657982261640797, "grad_norm": 0.0, "kl": 0.17968182265758514, "learning_rate": 1.3591765451166756e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11783 }, { "completion_length": 600.5, "epoch": 3.2660753880266076, "grad_norm": 0.4287213087081909, "kl": 4.933221481393737e+19, "learning_rate": 1.3587870108427244e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11784 }, { "completion_length": 593.75, "epoch": 3.2663525498891355, "grad_norm": 0.0, "kl": 0.16926641762256622, "learning_rate": 1.3583975115650283e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11785 }, { "completion_length": 532.5, "epoch": 3.266629711751663, "grad_norm": 0.0, "kl": 0.1893835812807083, "learning_rate": 1.3580080472955309e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11786 }, { "completion_length": 684.25, "epoch": 3.266906873614191, "grad_norm": 0.0, "kl": 0.1596437692642212, "learning_rate": 1.3576186180461757e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11787 }, { "completion_length": 538.25, "epoch": 3.2671840354767183, "grad_norm": 0.0, "kl": 0.22610041499137878, "learning_rate": 1.3572292238289064e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11788 }, { "completion_length": 626.0, "epoch": 3.2674611973392462, "grad_norm": 0.3666333556175232, "kl": 30757365760.0, "learning_rate": 1.356839864655663e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11789 }, { "completion_length": 542.5, "epoch": 3.2677383592017737, "grad_norm": 0.0, "kl": 46956852.0, "learning_rate": 1.3564505405383854e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11790 }, { "completion_length": 662.5, "epoch": 3.2680155210643016, "grad_norm": 0.4261302947998047, "kl": 2.184414869187461e+17, "learning_rate": 1.356061251489012e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11791 }, { "completion_length": 533.75, "epoch": 3.2682926829268295, "grad_norm": 0.4085789918899536, "kl": 3.078584454139085e+18, "learning_rate": 1.355671997519482e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11792 }, { "completion_length": 529.25, "epoch": 3.268569844789357, "grad_norm": 0.38674652576446533, "kl": 0.14889252185821533, "learning_rate": 1.3552827786417316e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11793 }, { "completion_length": 466.75, "epoch": 3.268847006651885, "grad_norm": 0.0, "kl": 0.31767508387565613, "learning_rate": 1.3548935948676955e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11794 }, { "completion_length": 585.75, "epoch": 3.2691241685144123, "grad_norm": 0.5634457468986511, "kl": 195078193152.0, "learning_rate": 1.3545044462093107e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11795 }, { "completion_length": 552.5, "epoch": 3.2694013303769403, "grad_norm": 0.0, "kl": 0.17486773431301117, "learning_rate": 1.3541153326785081e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11796 }, { "completion_length": 602.0, "epoch": 3.2696784922394677, "grad_norm": 0.0, "kl": 0.16871948540210724, "learning_rate": 1.3537262542872234e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11797 }, { "completion_length": 569.75, "epoch": 3.2699556541019956, "grad_norm": 0.0, "kl": 0.21482209861278534, "learning_rate": 1.353337211047384e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11798 }, { "completion_length": 552.25, "epoch": 3.270232815964523, "grad_norm": 0.0, "kl": 0.1674053817987442, "learning_rate": 1.3529482029709237e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11799 }, { "completion_length": 557.5, "epoch": 3.270509977827051, "grad_norm": 0.4008658528327942, "kl": 0.21404370665550232, "learning_rate": 1.3525592300697694e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11800 }, { "completion_length": 498.5, "epoch": 3.270787139689579, "grad_norm": 0.0, "kl": 0.18107090890407562, "learning_rate": 1.352170292355851e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11801 }, { "completion_length": 558.75, "epoch": 3.2710643015521064, "grad_norm": 0.0, "kl": 1.5992276668548584, "learning_rate": 1.3517813898410947e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11802 }, { "completion_length": 550.75, "epoch": 3.2713414634146343, "grad_norm": 1.8817589282989502, "kl": 2.259725102088192e+16, "learning_rate": 1.3513925225374264e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11803 }, { "completion_length": 605.5, "epoch": 3.2716186252771617, "grad_norm": 2.3525731563568115, "kl": 0.16584207117557526, "learning_rate": 1.3510036904567717e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11804 }, { "completion_length": 507.5, "epoch": 3.2718957871396896, "grad_norm": 0.0, "kl": 0.3562414050102234, "learning_rate": 1.3506148936110542e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11805 }, { "completion_length": 603.75, "epoch": 3.272172949002217, "grad_norm": 0.0, "kl": 1.3992078304290771, "learning_rate": 1.3502261320121968e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11806 }, { "completion_length": 613.0, "epoch": 3.272450110864745, "grad_norm": 0.0, "kl": 0.16515704989433289, "learning_rate": 1.3498374056721198e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11807 }, { "completion_length": 471.75, "epoch": 3.2727272727272725, "grad_norm": 0.0, "kl": 5.576998233795166, "learning_rate": 1.3494487146027464e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11808 }, { "completion_length": 518.5, "epoch": 3.2730044345898004, "grad_norm": 0.6374807357788086, "kl": 0.30773457884788513, "learning_rate": 1.3490600588159933e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11809 }, { "completion_length": 609.0, "epoch": 3.2732815964523283, "grad_norm": 0.0, "kl": 0.18734027445316315, "learning_rate": 1.3486714383237814e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11810 }, { "completion_length": 519.75, "epoch": 3.2735587583148558, "grad_norm": 0.0, "kl": 5.664284044846694e+16, "learning_rate": 1.3482828531380274e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11811 }, { "completion_length": 605.25, "epoch": 3.2738359201773837, "grad_norm": 0.0, "kl": 0.16670726239681244, "learning_rate": 1.347894303270646e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11812 }, { "completion_length": 583.5, "epoch": 3.274113082039911, "grad_norm": 0.0, "kl": 0.1634041666984558, "learning_rate": 1.347505788733555e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11813 }, { "completion_length": 589.25, "epoch": 3.274390243902439, "grad_norm": 0.0, "kl": 0.17499496042728424, "learning_rate": 1.347117309538667e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11814 }, { "completion_length": 504.75, "epoch": 3.2746674057649665, "grad_norm": 0.0, "kl": 0.21391116082668304, "learning_rate": 1.3467288656978952e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11815 }, { "completion_length": 566.25, "epoch": 3.2749445676274944, "grad_norm": 0.0, "kl": 0.17725107073783875, "learning_rate": 1.3463404572231507e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11816 }, { "completion_length": 582.75, "epoch": 3.2752217294900223, "grad_norm": 0.0, "kl": 0.29475530982017517, "learning_rate": 1.345952084126345e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11817 }, { "completion_length": 532.25, "epoch": 3.27549889135255, "grad_norm": 0.0, "kl": 0.16127969324588776, "learning_rate": 1.3455637464193904e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11818 }, { "completion_length": 552.75, "epoch": 3.2757760532150777, "grad_norm": 0.0, "kl": 567040192.0, "learning_rate": 1.3451754441141914e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11819 }, { "completion_length": 877.0, "epoch": 3.276053215077605, "grad_norm": 0.2779238224029541, "kl": 2.455491644175155e+16, "learning_rate": 1.3447871772226585e-06, "loss": -0.0, "reward": 4.53125, "reward_std": 2.3549039363861084, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.53125, "step": 11820 }, { "completion_length": 596.75, "epoch": 3.276330376940133, "grad_norm": 0.0, "kl": 0.17140233516693115, "learning_rate": 1.3443989457566963e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11821 }, { "completion_length": 533.25, "epoch": 3.2766075388026605, "grad_norm": 0.0, "kl": 0.1778852939605713, "learning_rate": 1.3440107497282123e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11822 }, { "completion_length": 574.5, "epoch": 3.2768847006651884, "grad_norm": 0.0, "kl": 0.2995091676712036, "learning_rate": 1.34362258914911e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11823 }, { "completion_length": 560.5, "epoch": 3.2771618625277164, "grad_norm": 1.4773298501968384, "kl": 557043.625, "learning_rate": 1.343234464031291e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11824 }, { "completion_length": 566.0, "epoch": 3.277439024390244, "grad_norm": 0.3781493902206421, "kl": 1.3140095238271926e+19, "learning_rate": 1.3428463743866603e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11825 }, { "completion_length": 543.5, "epoch": 3.2777161862527717, "grad_norm": 0.0, "kl": 0.18254947662353516, "learning_rate": 1.342458320227117e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11826 }, { "completion_length": 541.5, "epoch": 3.277993348115299, "grad_norm": 0.0, "kl": 0.22616279125213623, "learning_rate": 1.3420703015645631e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11827 }, { "completion_length": 616.5, "epoch": 3.278270509977827, "grad_norm": 0.0, "kl": 0.16060411930084229, "learning_rate": 1.3416823184108942e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11828 }, { "completion_length": 509.0, "epoch": 3.2785476718403546, "grad_norm": 0.0, "kl": 0.18626175820827484, "learning_rate": 1.3412943707780115e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11829 }, { "completion_length": 627.75, "epoch": 3.2788248337028825, "grad_norm": 0.0, "kl": 1175378984960.0, "learning_rate": 1.3409064586778092e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11830 }, { "completion_length": 585.75, "epoch": 3.2791019955654104, "grad_norm": 0.0, "kl": 0.18886056542396545, "learning_rate": 1.340518582122185e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11831 }, { "completion_length": 511.25, "epoch": 3.279379157427938, "grad_norm": 0.0, "kl": 0.19235625863075256, "learning_rate": 1.340130741123033e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11832 }, { "completion_length": 503.5, "epoch": 3.2796563192904657, "grad_norm": 0.0, "kl": 0.2746224105358124, "learning_rate": 1.3397429356922447e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11833 }, { "completion_length": 582.25, "epoch": 3.279933481152993, "grad_norm": 0.0, "kl": 0.15875756740570068, "learning_rate": 1.3393551658417154e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11834 }, { "completion_length": 511.5, "epoch": 3.280210643015521, "grad_norm": 0.0, "kl": 0.17529721558094025, "learning_rate": 1.3389674315833346e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11835 }, { "completion_length": 526.0, "epoch": 3.2804878048780486, "grad_norm": 0.0, "kl": 0.19688239693641663, "learning_rate": 1.3385797329289933e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11836 }, { "completion_length": 598.0, "epoch": 3.2807649667405765, "grad_norm": 0.0, "kl": 0.21473956108093262, "learning_rate": 1.3381920698905788e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11837 }, { "completion_length": 483.5, "epoch": 3.2810421286031044, "grad_norm": 0.0, "kl": 0.4570939242839813, "learning_rate": 1.337804442479982e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11838 }, { "completion_length": 492.25, "epoch": 3.281319290465632, "grad_norm": 0.0, "kl": 0.19437608122825623, "learning_rate": 1.3374168507090867e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11839 }, { "completion_length": 554.75, "epoch": 3.2815964523281598, "grad_norm": 0.0, "kl": 3365220424613888.0, "learning_rate": 1.3370292945897817e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11840 }, { "completion_length": 475.75, "epoch": 3.2818736141906872, "grad_norm": 0.0, "kl": 0.22561758756637573, "learning_rate": 1.3366417741339504e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11841 }, { "completion_length": 557.0, "epoch": 3.282150776053215, "grad_norm": 0.0, "kl": 0.16405227780342102, "learning_rate": 1.3362542893534759e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11842 }, { "completion_length": 600.75, "epoch": 3.2824279379157426, "grad_norm": 0.7877844572067261, "kl": 45840748544.0, "learning_rate": 1.3358668402602415e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11843 }, { "completion_length": 537.5, "epoch": 3.2827050997782705, "grad_norm": 0.0, "kl": 1.7202388556526387e+17, "learning_rate": 1.335479426866129e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11844 }, { "completion_length": 515.75, "epoch": 3.2829822616407984, "grad_norm": 0.0, "kl": 0.29973480105400085, "learning_rate": 1.3350920491830178e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11845 }, { "completion_length": 582.5, "epoch": 3.283259423503326, "grad_norm": 0.3691008687019348, "kl": 40845844480.0, "learning_rate": 1.334704707222787e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11846 }, { "completion_length": 582.25, "epoch": 3.283536585365854, "grad_norm": 0.4922265410423279, "kl": 0.1978812962770462, "learning_rate": 1.3343174009973147e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11847 }, { "completion_length": 595.5, "epoch": 3.2838137472283813, "grad_norm": 0.0, "kl": 0.19684970378875732, "learning_rate": 1.333930130518481e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11848 }, { "completion_length": 528.25, "epoch": 3.284090909090909, "grad_norm": 0.0, "kl": 0.19168181717395782, "learning_rate": 1.3335428957981575e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11849 }, { "completion_length": 563.25, "epoch": 3.2843680709534366, "grad_norm": 0.0, "kl": 0.1805885285139084, "learning_rate": 1.3331556968482218e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11850 }, { "completion_length": 554.0, "epoch": 3.2846452328159645, "grad_norm": 0.3942524492740631, "kl": 0.17823104560375214, "learning_rate": 1.3327685336805464e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11851 }, { "completion_length": 538.5, "epoch": 3.2849223946784925, "grad_norm": 0.0, "kl": 0.2572733759880066, "learning_rate": 1.332381406307005e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11852 }, { "completion_length": 465.75, "epoch": 3.28519955654102, "grad_norm": 0.0, "kl": 0.227339506149292, "learning_rate": 1.3319943147394687e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11853 }, { "completion_length": 652.75, "epoch": 3.285476718403548, "grad_norm": 0.0, "kl": 0.2875467836856842, "learning_rate": 1.3316072589898075e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11854 }, { "completion_length": 618.75, "epoch": 3.2857538802660753, "grad_norm": 0.0, "kl": 0.2003139704465866, "learning_rate": 1.331220239069892e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11855 }, { "completion_length": 525.5, "epoch": 3.286031042128603, "grad_norm": 0.0, "kl": 0.313057005405426, "learning_rate": 1.3308332549915895e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11856 }, { "completion_length": 630.0, "epoch": 3.2863082039911307, "grad_norm": 0.3965975344181061, "kl": 74112081920.0, "learning_rate": 1.3304463067667677e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11857 }, { "completion_length": 565.75, "epoch": 3.2865853658536586, "grad_norm": 0.0, "kl": 0.21598534286022186, "learning_rate": 1.3300593944072914e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11858 }, { "completion_length": 496.25, "epoch": 3.2868625277161865, "grad_norm": 0.0, "kl": 1.9546276330947876, "learning_rate": 1.3296725179250274e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11859 }, { "completion_length": 554.5, "epoch": 3.287139689578714, "grad_norm": 0.0, "kl": 0.18285492062568665, "learning_rate": 1.3292856773318387e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11860 }, { "completion_length": 652.25, "epoch": 3.287416851441242, "grad_norm": 0.0, "kl": 0.2199021875858307, "learning_rate": 1.3288988726395885e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11861 }, { "completion_length": 528.25, "epoch": 3.2876940133037693, "grad_norm": 0.0, "kl": 0.21104559302330017, "learning_rate": 1.328512103860138e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11862 }, { "completion_length": 509.25, "epoch": 3.287971175166297, "grad_norm": 0.0, "kl": 0.18257993459701538, "learning_rate": 1.3281253710053476e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11863 }, { "completion_length": 656.25, "epoch": 3.2882483370288247, "grad_norm": 0.0, "kl": 0.14719432592391968, "learning_rate": 1.3277386740870779e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11864 }, { "completion_length": 531.75, "epoch": 3.2885254988913526, "grad_norm": 0.0, "kl": 0.8110499978065491, "learning_rate": 1.3273520131171862e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11865 }, { "completion_length": 467.75, "epoch": 3.2888026607538805, "grad_norm": 0.0, "kl": 0.2232457995414734, "learning_rate": 1.3269653881075308e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11866 }, { "completion_length": 563.0, "epoch": 3.289079822616408, "grad_norm": 0.0, "kl": 0.2139611840248108, "learning_rate": 1.326578799069966e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11867 }, { "completion_length": 436.5, "epoch": 3.289356984478936, "grad_norm": 0.0, "kl": 0.1692211925983429, "learning_rate": 1.3261922460163485e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11868 }, { "completion_length": 585.0, "epoch": 3.2896341463414633, "grad_norm": 0.3850473463535309, "kl": 46164799488.0, "learning_rate": 1.3258057289585324e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11869 }, { "completion_length": 537.75, "epoch": 3.2899113082039912, "grad_norm": 0.0, "kl": 0.20205925405025482, "learning_rate": 1.3254192479083699e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11870 }, { "completion_length": 560.25, "epoch": 3.2901884700665187, "grad_norm": 0.0, "kl": 0.16698713600635529, "learning_rate": 1.3250328028777135e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11871 }, { "completion_length": 683.5, "epoch": 3.2904656319290466, "grad_norm": 0.0, "kl": 40901517312.0, "learning_rate": 1.3246463938784122e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11872 }, { "completion_length": 510.5, "epoch": 3.2907427937915745, "grad_norm": 0.0, "kl": 1.1014493703842163, "learning_rate": 1.3242600209223172e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11873 }, { "completion_length": 570.25, "epoch": 3.291019955654102, "grad_norm": 0.0, "kl": 0.8159713745117188, "learning_rate": 1.3238736840212768e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11874 }, { "completion_length": 537.0, "epoch": 3.29129711751663, "grad_norm": 0.38895443081855774, "kl": 0.1778927445411682, "learning_rate": 1.3234873831871375e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11875 }, { "completion_length": 478.5, "epoch": 3.2915742793791574, "grad_norm": 0.0, "kl": 0.23436239361763, "learning_rate": 1.3231011184317455e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11876 }, { "completion_length": 490.75, "epoch": 3.2918514412416853, "grad_norm": 0.0, "kl": 0.18342766165733337, "learning_rate": 1.3227148897669462e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11877 }, { "completion_length": 502.5, "epoch": 3.2921286031042127, "grad_norm": 0.0, "kl": 67.79700469970703, "learning_rate": 1.322328697204586e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11878 }, { "completion_length": 526.5, "epoch": 3.2924057649667406, "grad_norm": 0.0, "kl": 0.18358293175697327, "learning_rate": 1.3219425407565035e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11879 }, { "completion_length": 651.25, "epoch": 3.292682926829268, "grad_norm": 0.44198545813560486, "kl": 1.6756392280562074e+17, "learning_rate": 1.3215564204345439e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11880 }, { "completion_length": 532.5, "epoch": 3.292960088691796, "grad_norm": 0.0, "kl": 0.1864207684993744, "learning_rate": 1.321170336250545e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11881 }, { "completion_length": 513.5, "epoch": 3.2932372505543235, "grad_norm": 0.0, "kl": 0.1981499344110489, "learning_rate": 1.3207842882163499e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11882 }, { "completion_length": 586.0, "epoch": 3.2935144124168514, "grad_norm": 0.0, "kl": 0.1536451131105423, "learning_rate": 1.320398276343795e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11883 }, { "completion_length": 593.25, "epoch": 3.2937915742793793, "grad_norm": 0.0, "kl": 0.1613592654466629, "learning_rate": 1.3200123006447168e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11884 }, { "completion_length": 539.75, "epoch": 3.2940687361419068, "grad_norm": 0.0, "kl": 0.18132002651691437, "learning_rate": 1.3196263611309539e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11885 }, { "completion_length": 578.25, "epoch": 3.2943458980044347, "grad_norm": 0.0, "kl": 0.3860134780406952, "learning_rate": 1.3192404578143404e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11886 }, { "completion_length": 542.75, "epoch": 3.294623059866962, "grad_norm": 0.3835459053516388, "kl": 0.23308005928993225, "learning_rate": 1.3188545907067097e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11887 }, { "completion_length": 526.25, "epoch": 3.29490022172949, "grad_norm": 0.0, "kl": 0.16371358931064606, "learning_rate": 1.3184687598198947e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11888 }, { "completion_length": 570.0, "epoch": 3.2951773835920175, "grad_norm": 0.0, "kl": 0.17736943066120148, "learning_rate": 1.3180829651657284e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11889 }, { "completion_length": 579.75, "epoch": 3.2954545454545454, "grad_norm": 0.0, "kl": 0.1921492964029312, "learning_rate": 1.3176972067560401e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11890 }, { "completion_length": 591.25, "epoch": 3.2957317073170733, "grad_norm": 4.712375164031982, "kl": 6.872576594251088e+18, "learning_rate": 1.3173114846026614e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11891 }, { "completion_length": 510.5, "epoch": 3.296008869179601, "grad_norm": 0.0, "kl": 0.20353856682777405, "learning_rate": 1.3169257987174194e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11892 }, { "completion_length": 502.25, "epoch": 3.2962860310421287, "grad_norm": 0.0, "kl": 0.2074035406112671, "learning_rate": 1.316540149112141e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11893 }, { "completion_length": 479.0, "epoch": 3.296563192904656, "grad_norm": 0.5104115605354309, "kl": 0.16652795672416687, "learning_rate": 1.3161545357986537e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11894 }, { "completion_length": 631.25, "epoch": 3.296840354767184, "grad_norm": 0.0, "kl": 0.15300078690052032, "learning_rate": 1.3157689587887823e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11895 }, { "completion_length": 473.5, "epoch": 3.2971175166297115, "grad_norm": 0.0, "kl": 0.2133079469203949, "learning_rate": 1.315383418094351e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11896 }, { "completion_length": 604.5, "epoch": 3.2973946784922394, "grad_norm": 0.0, "kl": 0.17415489256381989, "learning_rate": 1.3149979137271806e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11897 }, { "completion_length": 549.25, "epoch": 3.2976718403547673, "grad_norm": 0.0, "kl": 0.17714564502239227, "learning_rate": 1.3146124456990955e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11898 }, { "completion_length": 586.5, "epoch": 3.297949002217295, "grad_norm": 0.4260779023170471, "kl": 0.1581851989030838, "learning_rate": 1.3142270140219172e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11899 }, { "completion_length": 499.75, "epoch": 3.2982261640798227, "grad_norm": 0.0, "kl": 0.23013430833816528, "learning_rate": 1.3138416187074616e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11900 }, { "completion_length": 599.5, "epoch": 3.29850332594235, "grad_norm": 0.5009694695472717, "kl": 41395757056.0, "learning_rate": 1.3134562597675504e-06, "loss": -0.0, "reward": 1.1875, "reward_std": 0.680838942527771, "rewards/confident_score_func": 0.125, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.125, "rewards/xmlcount_reward_func": 0.5625, "step": 11901 }, { "completion_length": 570.75, "epoch": 3.298780487804878, "grad_norm": 0.0, "kl": 0.22510135173797607, "learning_rate": 1.3130709372139987e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11902 }, { "completion_length": 601.0, "epoch": 3.2990576496674056, "grad_norm": 0.0, "kl": 0.2177824229001999, "learning_rate": 1.312685651058625e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11903 }, { "completion_length": 595.5, "epoch": 3.2993348115299335, "grad_norm": 0.36944010853767395, "kl": 2.2991867032580915e+18, "learning_rate": 1.3123004013132434e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11904 }, { "completion_length": 500.5, "epoch": 3.2996119733924614, "grad_norm": 0.0, "kl": 0.2271694391965866, "learning_rate": 1.3119151879896674e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11905 }, { "completion_length": 562.25, "epoch": 3.299889135254989, "grad_norm": 0.0, "kl": 0.1677131950855255, "learning_rate": 1.3115300110997097e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11906 }, { "completion_length": 526.0, "epoch": 3.3001662971175167, "grad_norm": 0.0, "kl": 0.21824952960014343, "learning_rate": 1.3111448706551835e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11907 }, { "completion_length": 562.25, "epoch": 3.300443458980044, "grad_norm": 0.0, "kl": 0.2675701379776001, "learning_rate": 1.3107597666678984e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11908 }, { "completion_length": 497.75, "epoch": 3.300720620842572, "grad_norm": 0.4817970395088196, "kl": 1.9995161834972774e+18, "learning_rate": 1.3103746991496635e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11909 }, { "completion_length": 576.25, "epoch": 3.3009977827050996, "grad_norm": 0.0, "kl": 0.18134866654872894, "learning_rate": 1.3099896681122886e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11910 }, { "completion_length": 502.25, "epoch": 3.3012749445676275, "grad_norm": 0.5131590366363525, "kl": 2.5934254981387387e+18, "learning_rate": 1.3096046735675795e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11911 }, { "completion_length": 541.25, "epoch": 3.3015521064301554, "grad_norm": 0.0, "kl": 0.16613256931304932, "learning_rate": 1.3092197155273449e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11912 }, { "completion_length": 496.75, "epoch": 3.301829268292683, "grad_norm": 0.0, "kl": 0.21087926626205444, "learning_rate": 1.308834794003386e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11913 }, { "completion_length": 578.75, "epoch": 3.3021064301552108, "grad_norm": 0.0, "kl": 0.1749260127544403, "learning_rate": 1.3084499090075093e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11914 }, { "completion_length": 550.75, "epoch": 3.3023835920177382, "grad_norm": 0.0, "kl": 2.4470763206481934, "learning_rate": 1.3080650605515177e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11915 }, { "completion_length": 586.75, "epoch": 3.302660753880266, "grad_norm": 0.0, "kl": 0.18952800333499908, "learning_rate": 1.3076802486472123e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11916 }, { "completion_length": 528.5, "epoch": 3.3029379157427936, "grad_norm": 0.0, "kl": 0.22693577408790588, "learning_rate": 1.3072954733063942e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11917 }, { "completion_length": 490.0, "epoch": 3.3032150776053215, "grad_norm": 0.0, "kl": 0.2053333818912506, "learning_rate": 1.306910734540861e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11918 }, { "completion_length": 542.25, "epoch": 3.3034922394678494, "grad_norm": 0.0, "kl": 0.1934298872947693, "learning_rate": 1.3065260323624137e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11919 }, { "completion_length": 472.75, "epoch": 3.303769401330377, "grad_norm": 0.4483228921890259, "kl": 460677021696.0, "learning_rate": 1.3061413667828471e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11920 }, { "completion_length": 605.5, "epoch": 3.304046563192905, "grad_norm": 0.0, "kl": 0.16048437356948853, "learning_rate": 1.3057567378139596e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11921 }, { "completion_length": 531.75, "epoch": 3.3043237250554323, "grad_norm": 0.0, "kl": 0.19363661110401154, "learning_rate": 1.305372145467545e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11922 }, { "completion_length": 587.5, "epoch": 3.30460088691796, "grad_norm": 0.0, "kl": 0.2084132581949234, "learning_rate": 1.3049875897553962e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11923 }, { "completion_length": 532.25, "epoch": 3.3048780487804876, "grad_norm": 0.0, "kl": 0.20439594984054565, "learning_rate": 1.3046030706893079e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11924 }, { "completion_length": 558.75, "epoch": 3.3051552106430155, "grad_norm": 0.43316447734832764, "kl": 48115695616.0, "learning_rate": 1.3042185882810703e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11925 }, { "completion_length": 565.25, "epoch": 3.3054323725055434, "grad_norm": 0.0, "kl": 0.21131514012813568, "learning_rate": 1.303834142542475e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11926 }, { "completion_length": 594.75, "epoch": 3.305709534368071, "grad_norm": 0.40985554456710815, "kl": 65620885504.0, "learning_rate": 1.3034497334853092e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11927 }, { "completion_length": 495.25, "epoch": 3.305986696230599, "grad_norm": 0.0, "kl": 0.21269585192203522, "learning_rate": 1.3030653611213623e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11928 }, { "completion_length": 499.25, "epoch": 3.3062638580931263, "grad_norm": 0.0, "kl": 0.22770391404628754, "learning_rate": 1.302681025462424e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11929 }, { "completion_length": 583.0, "epoch": 3.306541019955654, "grad_norm": 0.0, "kl": 0.1685987263917923, "learning_rate": 1.3022967265202758e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11930 }, { "completion_length": 464.25, "epoch": 3.3068181818181817, "grad_norm": 0.0, "kl": 0.2072884440422058, "learning_rate": 1.3019124643067058e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11931 }, { "completion_length": 505.5, "epoch": 3.3070953436807096, "grad_norm": 0.0, "kl": 0.20832261443138123, "learning_rate": 1.3015282388334953e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11932 }, { "completion_length": 560.0, "epoch": 3.3073725055432375, "grad_norm": 0.0, "kl": 0.21179169416427612, "learning_rate": 1.3011440501124295e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11933 }, { "completion_length": 469.0, "epoch": 3.307649667405765, "grad_norm": 0.0, "kl": 4.694835186004639, "learning_rate": 1.3007598981552886e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11934 }, { "completion_length": 573.25, "epoch": 3.307926829268293, "grad_norm": 0.0, "kl": 0.2611740827560425, "learning_rate": 1.3003757829738527e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11935 }, { "completion_length": 562.0, "epoch": 3.3082039911308203, "grad_norm": 0.0, "kl": 0.1848919838666916, "learning_rate": 1.2999917045799004e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11936 }, { "completion_length": 611.0, "epoch": 3.308481152993348, "grad_norm": 0.0, "kl": 0.2023225724697113, "learning_rate": 1.2996076629852114e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11937 }, { "completion_length": 529.75, "epoch": 3.3087583148558757, "grad_norm": 0.0, "kl": 0.2004481852054596, "learning_rate": 1.2992236582015621e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11938 }, { "completion_length": 536.75, "epoch": 3.3090354767184036, "grad_norm": 0.0, "kl": 0.21642261743545532, "learning_rate": 1.298839690240727e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11939 }, { "completion_length": 601.25, "epoch": 3.3093126385809315, "grad_norm": 0.5154387354850769, "kl": 9.337397989724914e+18, "learning_rate": 1.2984557591144827e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11940 }, { "completion_length": 505.75, "epoch": 3.309589800443459, "grad_norm": 0.0, "kl": 0.22139610350131989, "learning_rate": 1.2980718648346014e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11941 }, { "completion_length": 502.0, "epoch": 3.309866962305987, "grad_norm": 0.0, "kl": 0.20609337091445923, "learning_rate": 1.297688007412858e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11942 }, { "completion_length": 470.25, "epoch": 3.3101441241685143, "grad_norm": 0.0, "kl": 0.21747703850269318, "learning_rate": 1.2973041868610196e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11943 }, { "completion_length": 491.25, "epoch": 3.3104212860310422, "grad_norm": 0.0, "kl": 0.19697222113609314, "learning_rate": 1.2969204031908588e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11944 }, { "completion_length": 546.75, "epoch": 3.3106984478935697, "grad_norm": 0.0, "kl": 2.0213711261749268, "learning_rate": 1.2965366564141457e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11945 }, { "completion_length": 609.25, "epoch": 3.3109756097560976, "grad_norm": 0.0, "kl": 0.17966265976428986, "learning_rate": 1.296152946542647e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11946 }, { "completion_length": 513.5, "epoch": 3.3112527716186255, "grad_norm": 0.0, "kl": 0.21606439352035522, "learning_rate": 1.2957692735881295e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11947 }, { "completion_length": 484.0, "epoch": 3.311529933481153, "grad_norm": 0.0, "kl": 345382.0625, "learning_rate": 1.295385637562358e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11948 }, { "completion_length": 569.25, "epoch": 3.311807095343681, "grad_norm": 0.0, "kl": 0.17158015072345734, "learning_rate": 1.2950020384770986e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11949 }, { "completion_length": 566.5, "epoch": 3.3120842572062084, "grad_norm": 0.0, "kl": 0.19695140421390533, "learning_rate": 1.2946184763441145e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11950 }, { "completion_length": 588.75, "epoch": 3.3123614190687363, "grad_norm": 0.0, "kl": 0.263374924659729, "learning_rate": 1.294234951175166e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11951 }, { "completion_length": 621.25, "epoch": 3.3126385809312637, "grad_norm": 0.43221190571784973, "kl": 1.3602821133408666e+18, "learning_rate": 1.293851462982017e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11952 }, { "completion_length": 597.5, "epoch": 3.3129157427937916, "grad_norm": 0.0, "kl": 0.18499954044818878, "learning_rate": 1.2934680117764254e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11953 }, { "completion_length": 542.5, "epoch": 3.313192904656319, "grad_norm": 0.0, "kl": 0.2901802957057953, "learning_rate": 1.293084597570151e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11954 }, { "completion_length": 609.5, "epoch": 3.313470066518847, "grad_norm": 0.0, "kl": 0.2032068520784378, "learning_rate": 1.2927012203749524e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11955 }, { "completion_length": 561.25, "epoch": 3.3137472283813745, "grad_norm": 0.0, "kl": 0.1815030872821808, "learning_rate": 1.2923178802025844e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11956 }, { "completion_length": 652.25, "epoch": 3.3140243902439024, "grad_norm": 0.0, "kl": 0.16593043506145477, "learning_rate": 1.2919345770648023e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11957 }, { "completion_length": 582.0, "epoch": 3.3143015521064303, "grad_norm": 0.3911333382129669, "kl": 1.3224319753104916e+18, "learning_rate": 1.2915513109733624e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11958 }, { "completion_length": 531.75, "epoch": 3.3145787139689578, "grad_norm": 0.0, "kl": 0.17200063169002533, "learning_rate": 1.2911680819400168e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11959 }, { "completion_length": 553.25, "epoch": 3.3148558758314857, "grad_norm": 0.0, "kl": 0.17429682612419128, "learning_rate": 1.2907848899765167e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11960 }, { "completion_length": 535.75, "epoch": 3.315133037694013, "grad_norm": 6.7846527099609375, "kl": 2076955.375, "learning_rate": 1.2904017350946146e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11961 }, { "completion_length": 601.25, "epoch": 3.315410199556541, "grad_norm": 0.3858075737953186, "kl": 9.266050405320622e+17, "learning_rate": 1.2900186173060582e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11962 }, { "completion_length": 576.5, "epoch": 3.3156873614190685, "grad_norm": 0.0, "kl": 0.574053168296814, "learning_rate": 1.2896355366226e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11963 }, { "completion_length": 568.25, "epoch": 3.3159645232815964, "grad_norm": 0.0, "kl": 0.3739520013332367, "learning_rate": 1.2892524930559823e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11964 }, { "completion_length": 560.75, "epoch": 3.3162416851441243, "grad_norm": 2.8231449127197266, "kl": 8449418752.0, "learning_rate": 1.2888694866179555e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11965 }, { "completion_length": 530.25, "epoch": 3.316518847006652, "grad_norm": 0.0, "kl": 0.20392614603042603, "learning_rate": 1.288486517320262e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11966 }, { "completion_length": 542.25, "epoch": 3.3167960088691797, "grad_norm": 0.0, "kl": 4.0349097750062694e+17, "learning_rate": 1.2881035851746486e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11967 }, { "completion_length": 521.0, "epoch": 3.317073170731707, "grad_norm": 0.0, "kl": 0.1954166144132614, "learning_rate": 1.2877206901928568e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11968 }, { "completion_length": 511.5, "epoch": 3.317350332594235, "grad_norm": 0.0, "kl": 0.21202223002910614, "learning_rate": 1.2873378323866273e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11969 }, { "completion_length": 484.25, "epoch": 3.3176274944567625, "grad_norm": 0.0, "kl": 0.21669849753379822, "learning_rate": 1.2869550117677031e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11970 }, { "completion_length": 504.75, "epoch": 3.3179046563192904, "grad_norm": 0.0, "kl": 0.2896517813205719, "learning_rate": 1.2865722283478215e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11971 }, { "completion_length": 528.75, "epoch": 3.3181818181818183, "grad_norm": 0.0, "kl": 0.2084892988204956, "learning_rate": 1.286189482138724e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11972 }, { "completion_length": 618.0, "epoch": 3.318458980044346, "grad_norm": 11.26774787902832, "kl": 6800176.5, "learning_rate": 1.2858067731521434e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11973 }, { "completion_length": 565.25, "epoch": 3.3187361419068737, "grad_norm": 0.0, "kl": 0.17676310241222382, "learning_rate": 1.2854241013998187e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11974 }, { "completion_length": 539.25, "epoch": 3.319013303769401, "grad_norm": 0.0, "kl": 0.1739775687456131, "learning_rate": 1.285041466893485e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11975 }, { "completion_length": 570.25, "epoch": 3.319290465631929, "grad_norm": 0.0, "kl": 0.1635681539773941, "learning_rate": 1.2846588696448753e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11976 }, { "completion_length": 618.0, "epoch": 3.3195676274944566, "grad_norm": 0.0, "kl": 0.18389888107776642, "learning_rate": 1.2842763096657224e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11977 }, { "completion_length": 554.75, "epoch": 3.3198447893569845, "grad_norm": 0.0, "kl": 0.18969769775867462, "learning_rate": 1.283893786967757e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11978 }, { "completion_length": 566.75, "epoch": 3.3201219512195124, "grad_norm": 0.0, "kl": 0.19283954799175262, "learning_rate": 1.283511301562711e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11979 }, { "completion_length": 578.75, "epoch": 3.32039911308204, "grad_norm": 0.0, "kl": 0.18223069608211517, "learning_rate": 1.2831288534623131e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11980 }, { "completion_length": 579.75, "epoch": 3.3206762749445677, "grad_norm": 0.4810508191585541, "kl": 5.843902102606184e+18, "learning_rate": 1.2827464426782904e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11981 }, { "completion_length": 511.0, "epoch": 3.320953436807095, "grad_norm": 0.0, "kl": 0.19606226682662964, "learning_rate": 1.2823640692223713e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11982 }, { "completion_length": 502.5, "epoch": 3.321230598669623, "grad_norm": 0.0, "kl": 0.2860717475414276, "learning_rate": 1.28198173310628e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11983 }, { "completion_length": 559.5, "epoch": 3.3215077605321506, "grad_norm": 0.40726277232170105, "kl": 4.781917356441993e+18, "learning_rate": 1.281599434341743e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11984 }, { "completion_length": 546.75, "epoch": 3.3217849223946785, "grad_norm": 0.0, "kl": 0.1857040822505951, "learning_rate": 1.2812171729404835e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11985 }, { "completion_length": 543.5, "epoch": 3.3220620842572064, "grad_norm": 0.0, "kl": 244843184128.0, "learning_rate": 1.280834948914223e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11986 }, { "completion_length": 605.75, "epoch": 3.322339246119734, "grad_norm": 0.0, "kl": 0.19883157312870026, "learning_rate": 1.280452762274682e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11987 }, { "completion_length": 509.75, "epoch": 3.3226164079822618, "grad_norm": 0.0, "kl": 0.302322655916214, "learning_rate": 1.2800706130335824e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11988 }, { "completion_length": 509.5, "epoch": 3.3228935698447892, "grad_norm": 0.0, "kl": 0.2263992577791214, "learning_rate": 1.2796885012026426e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11989 }, { "completion_length": 555.75, "epoch": 3.323170731707317, "grad_norm": 0.0, "kl": 25639772.0, "learning_rate": 1.279306426793579e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11990 }, { "completion_length": 534.25, "epoch": 3.3234478935698446, "grad_norm": 0.0, "kl": 0.3056204319000244, "learning_rate": 1.2789243898181104e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11991 }, { "completion_length": 722.5, "epoch": 3.3237250554323725, "grad_norm": 0.0, "kl": 0.17976316809654236, "learning_rate": 1.2785423902879499e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11992 }, { "completion_length": 550.5, "epoch": 3.3240022172949004, "grad_norm": 0.0, "kl": 0.21574759483337402, "learning_rate": 1.2781604282148153e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11993 }, { "completion_length": 536.0, "epoch": 3.324279379157428, "grad_norm": 0.0, "kl": 0.18928444385528564, "learning_rate": 1.2777785036104157e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11994 }, { "completion_length": 583.0, "epoch": 3.324556541019956, "grad_norm": 0.0, "kl": 0.18778523802757263, "learning_rate": 1.277396616486465e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11995 }, { "completion_length": 534.75, "epoch": 3.3248337028824833, "grad_norm": 0.0, "kl": 0.2121710330247879, "learning_rate": 1.2770147668546753e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11996 }, { "completion_length": 579.5, "epoch": 3.325110864745011, "grad_norm": 0.0, "kl": 0.2780712842941284, "learning_rate": 1.2766329547267552e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11997 }, { "completion_length": 628.75, "epoch": 3.3253880266075386, "grad_norm": 0.0, "kl": 0.2094404548406601, "learning_rate": 1.2762511801144135e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11998 }, { "completion_length": 589.75, "epoch": 3.3256651884700665, "grad_norm": 0.0, "kl": 0.2007807493209839, "learning_rate": 1.2758694430293562e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 11999 }, { "completion_length": 561.75, "epoch": 3.3259423503325944, "grad_norm": 0.0, "kl": 0.18304035067558289, "learning_rate": 1.2754877434832916e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12000 }, { "completion_length": 527.5, "epoch": 3.326219512195122, "grad_norm": 0.5809678435325623, "kl": 637610688512.0, "learning_rate": 1.275106081487924e-06, "loss": -0.0, "reward": 3.375, "reward_std": 1.6007810831069946, "rewards/confident_score_func": 0.625, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12001 }, { "completion_length": 530.25, "epoch": 3.32649667405765, "grad_norm": 0.0, "kl": 0.17182813584804535, "learning_rate": 1.2747244570549578e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12002 }, { "completion_length": 488.5, "epoch": 3.3267738359201773, "grad_norm": 0.0, "kl": 0.22995489835739136, "learning_rate": 1.2743428701960948e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12003 }, { "completion_length": 477.5, "epoch": 3.327050997782705, "grad_norm": 0.0, "kl": 0.22191360592842102, "learning_rate": 1.2739613209230367e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12004 }, { "completion_length": 649.5, "epoch": 3.3273281596452327, "grad_norm": 0.0, "kl": 325967712.0, "learning_rate": 1.273579809247486e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12005 }, { "completion_length": 617.0, "epoch": 3.3276053215077606, "grad_norm": 0.0, "kl": 0.2130177915096283, "learning_rate": 1.2731983351811405e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12006 }, { "completion_length": 513.0, "epoch": 3.3278824833702885, "grad_norm": 0.0, "kl": 0.2588014006614685, "learning_rate": 1.2728168987356992e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12007 }, { "completion_length": 526.75, "epoch": 3.328159645232816, "grad_norm": 0.0, "kl": 0.16679899394512177, "learning_rate": 1.2724354999228572e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12008 }, { "completion_length": 542.75, "epoch": 3.328436807095344, "grad_norm": 0.0, "kl": 0.16973750293254852, "learning_rate": 1.2720541387543135e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12009 }, { "completion_length": 590.5, "epoch": 3.3287139689578713, "grad_norm": 0.0, "kl": 0.18957999348640442, "learning_rate": 1.2716728152417608e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12010 }, { "completion_length": 486.5, "epoch": 3.328991130820399, "grad_norm": 0.0, "kl": 0.31631484627723694, "learning_rate": 1.271291529396892e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12011 }, { "completion_length": 496.75, "epoch": 3.3292682926829267, "grad_norm": 0.0, "kl": 0.2025224268436432, "learning_rate": 1.2709102812314017e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12012 }, { "completion_length": 584.0, "epoch": 3.3295454545454546, "grad_norm": 0.4124087989330292, "kl": 0.42007976770401, "learning_rate": 1.2705290707569795e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12013 }, { "completion_length": 696.0, "epoch": 3.3298226164079825, "grad_norm": 0.3477679193019867, "kl": 2.0779629021652582e+18, "learning_rate": 1.270147897985317e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12014 }, { "completion_length": 534.25, "epoch": 3.33009977827051, "grad_norm": 0.0, "kl": 0.2381838858127594, "learning_rate": 1.2697667629281025e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12015 }, { "completion_length": 575.25, "epoch": 3.330376940133038, "grad_norm": 0.0, "kl": 0.21828536689281464, "learning_rate": 1.2693856655970236e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12016 }, { "completion_length": 488.5, "epoch": 3.3306541019955653, "grad_norm": 0.0, "kl": 0.19908122718334198, "learning_rate": 1.2690046060037661e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12017 }, { "completion_length": 688.0, "epoch": 3.3309312638580932, "grad_norm": 0.0, "kl": 0.17844633758068085, "learning_rate": 1.2686235841600175e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12018 }, { "completion_length": 490.5, "epoch": 3.3312084257206207, "grad_norm": 0.0, "kl": 0.20710819959640503, "learning_rate": 1.2682426000774617e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12019 }, { "completion_length": 481.25, "epoch": 3.3314855875831486, "grad_norm": 0.0, "kl": 0.22707650065422058, "learning_rate": 1.2678616537677798e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12020 }, { "completion_length": 538.5, "epoch": 3.3317627494456765, "grad_norm": 0.0, "kl": 0.21994523704051971, "learning_rate": 1.2674807452426566e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12021 }, { "completion_length": 564.75, "epoch": 3.332039911308204, "grad_norm": 0.0, "kl": 0.18581567704677582, "learning_rate": 1.2670998745137709e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12022 }, { "completion_length": 664.0, "epoch": 3.332317073170732, "grad_norm": 0.0, "kl": 0.1807631254196167, "learning_rate": 1.266719041592805e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12023 }, { "completion_length": 672.75, "epoch": 3.3325942350332594, "grad_norm": 0.0, "kl": 0.18049408495426178, "learning_rate": 1.266338246491434e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12024 }, { "completion_length": 585.25, "epoch": 3.3328713968957873, "grad_norm": 0.43106773495674133, "kl": 0.19254986941814423, "learning_rate": 1.2659574892213372e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12025 }, { "completion_length": 541.5, "epoch": 3.3331485587583147, "grad_norm": 0.0, "kl": 0.22765301167964935, "learning_rate": 1.2655767697941916e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12026 }, { "completion_length": 548.75, "epoch": 3.3334257206208426, "grad_norm": 0.4132409989833832, "kl": 0.9313096404075623, "learning_rate": 1.2651960882216712e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12027 }, { "completion_length": 522.75, "epoch": 3.33370288248337, "grad_norm": 0.0, "kl": 0.17390692234039307, "learning_rate": 1.2648154445154503e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12028 }, { "completion_length": 504.5, "epoch": 3.333980044345898, "grad_norm": 0.0, "kl": 0.19806842505931854, "learning_rate": 1.2644348386872002e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12029 }, { "completion_length": 475.0, "epoch": 3.3342572062084255, "grad_norm": 0.0, "kl": 0.22138267755508423, "learning_rate": 1.2640542707485952e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12030 }, { "completion_length": 517.25, "epoch": 3.3345343680709534, "grad_norm": 0.0, "kl": 0.20163710415363312, "learning_rate": 1.2636737407113043e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12031 }, { "completion_length": 570.75, "epoch": 3.3348115299334813, "grad_norm": 0.0, "kl": 0.2250201255083084, "learning_rate": 1.2632932485869965e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12032 }, { "completion_length": 516.75, "epoch": 3.3350886917960088, "grad_norm": 0.0, "kl": 2.937813679787213e+16, "learning_rate": 1.2629127943873393e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12033 }, { "completion_length": 580.25, "epoch": 3.3353658536585367, "grad_norm": 4.144503116607666, "kl": 4.439823657992192e+16, "learning_rate": 1.2625323781240007e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12034 }, { "completion_length": 531.5, "epoch": 3.335643015521064, "grad_norm": 0.0, "kl": 5.217578411102295, "learning_rate": 1.2621519998086467e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12035 }, { "completion_length": 548.0, "epoch": 3.335920177383592, "grad_norm": 0.0, "kl": 0.22419776022434235, "learning_rate": 1.2617716594529421e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12036 }, { "completion_length": 540.0, "epoch": 3.3361973392461195, "grad_norm": 0.0, "kl": 0.21386536955833435, "learning_rate": 1.26139135706855e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12037 }, { "completion_length": 521.0, "epoch": 3.3364745011086474, "grad_norm": 0.0, "kl": 0.2034420669078827, "learning_rate": 1.2610110926671309e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12038 }, { "completion_length": 544.0, "epoch": 3.3367516629711753, "grad_norm": 0.0, "kl": 0.20051345229148865, "learning_rate": 1.260630866260349e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12039 }, { "completion_length": 589.0, "epoch": 3.337028824833703, "grad_norm": 0.0, "kl": 0.18590141832828522, "learning_rate": 1.2602506778598627e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12040 }, { "completion_length": 589.75, "epoch": 3.3373059866962307, "grad_norm": 0.0, "kl": 0.17464110255241394, "learning_rate": 1.2598705274773299e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12041 }, { "completion_length": 550.5, "epoch": 3.337583148558758, "grad_norm": 0.0, "kl": 1.3394458293914795, "learning_rate": 1.25949041512441e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12042 }, { "completion_length": 567.0, "epoch": 3.337860310421286, "grad_norm": 0.43124833703041077, "kl": 0.20600363612174988, "learning_rate": 1.2591103408127577e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12043 }, { "completion_length": 553.0, "epoch": 3.3381374722838135, "grad_norm": 0.0, "kl": 0.20425479114055634, "learning_rate": 1.2587303045540315e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12044 }, { "completion_length": 576.0, "epoch": 3.3384146341463414, "grad_norm": 0.0, "kl": 3.186554090805002e+17, "learning_rate": 1.2583503063598813e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12045 }, { "completion_length": 490.25, "epoch": 3.3386917960088693, "grad_norm": 0.0, "kl": 0.23981904983520508, "learning_rate": 1.2579703462419629e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12046 }, { "completion_length": 498.25, "epoch": 3.338968957871397, "grad_norm": 0.0, "kl": 0.2686111032962799, "learning_rate": 1.2575904242119264e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12047 }, { "completion_length": 588.25, "epoch": 3.3392461197339247, "grad_norm": 0.0, "kl": 0.6890186667442322, "learning_rate": 1.2572105402814239e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12048 }, { "completion_length": 548.0, "epoch": 3.339523281596452, "grad_norm": 0.0, "kl": 0.19340626895427704, "learning_rate": 1.2568306944621045e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12049 }, { "completion_length": 473.5, "epoch": 3.33980044345898, "grad_norm": 0.0, "kl": 0.2026233673095703, "learning_rate": 1.2564508867656156e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12050 }, { "completion_length": 535.75, "epoch": 3.3400776053215075, "grad_norm": 0.0, "kl": 0.2452063262462616, "learning_rate": 1.2560711172036056e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12051 }, { "completion_length": 491.0, "epoch": 3.3403547671840355, "grad_norm": 0.0, "kl": 0.3884827196598053, "learning_rate": 1.2556913857877196e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12052 }, { "completion_length": 476.5, "epoch": 3.3406319290465634, "grad_norm": 0.0, "kl": 0.2165423184633255, "learning_rate": 1.255311692529603e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12053 }, { "completion_length": 512.5, "epoch": 3.340909090909091, "grad_norm": 0.0, "kl": 0.2476009726524353, "learning_rate": 1.254932037440898e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12054 }, { "completion_length": 443.0, "epoch": 3.3411862527716187, "grad_norm": 0.0, "kl": 0.1982477456331253, "learning_rate": 1.254552420533248e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12055 }, { "completion_length": 507.5, "epoch": 3.341463414634146, "grad_norm": 0.0, "kl": 0.3172246813774109, "learning_rate": 1.2541728418182953e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12056 }, { "completion_length": 563.25, "epoch": 3.341740576496674, "grad_norm": 0.0, "kl": 0.1988593190908432, "learning_rate": 1.253793301307679e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12057 }, { "completion_length": 529.0, "epoch": 3.3420177383592016, "grad_norm": 0.0, "kl": 0.193453848361969, "learning_rate": 1.2534137990130379e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12058 }, { "completion_length": 495.75, "epoch": 3.3422949002217295, "grad_norm": 0.0, "kl": 0.24241186678409576, "learning_rate": 1.2530343349460091e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12059 }, { "completion_length": 542.0, "epoch": 3.3425720620842574, "grad_norm": 0.0, "kl": 0.19901993870735168, "learning_rate": 1.252654909118231e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12060 }, { "completion_length": 528.0, "epoch": 3.342849223946785, "grad_norm": 0.0, "kl": 0.2468581199645996, "learning_rate": 1.252275521541338e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12061 }, { "completion_length": 565.75, "epoch": 3.3431263858093128, "grad_norm": 0.0, "kl": 0.2873724102973938, "learning_rate": 1.2518961722269645e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12062 }, { "completion_length": 587.25, "epoch": 3.3434035476718402, "grad_norm": 0.0, "kl": 0.19129690527915955, "learning_rate": 1.2515168611867423e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12063 }, { "completion_length": 471.5, "epoch": 3.343680709534368, "grad_norm": 0.0, "kl": 0.19114434719085693, "learning_rate": 1.2511375884323044e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12064 }, { "completion_length": 618.0, "epoch": 3.3439578713968956, "grad_norm": 0.0, "kl": 0.19520513713359833, "learning_rate": 1.2507583539752824e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12065 }, { "completion_length": 466.75, "epoch": 3.3442350332594235, "grad_norm": 0.0, "kl": 0.2053682804107666, "learning_rate": 1.250379157827305e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12066 }, { "completion_length": 580.75, "epoch": 3.3445121951219514, "grad_norm": 0.0, "kl": 0.22052259743213654, "learning_rate": 1.2500000000000007e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12067 }, { "completion_length": 604.0, "epoch": 3.344789356984479, "grad_norm": 0.0, "kl": 0.19145545363426208, "learning_rate": 1.2496208805049953e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12068 }, { "completion_length": 543.75, "epoch": 3.345066518847007, "grad_norm": 0.0, "kl": 0.24415090680122375, "learning_rate": 1.2492417993539172e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12069 }, { "completion_length": 572.25, "epoch": 3.3453436807095343, "grad_norm": 0.0, "kl": 0.22634050250053406, "learning_rate": 1.24886275655839e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12070 }, { "completion_length": 534.25, "epoch": 3.345620842572062, "grad_norm": 0.0, "kl": 0.2132735699415207, "learning_rate": 1.2484837521300366e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12071 }, { "completion_length": 547.5, "epoch": 3.3458980044345896, "grad_norm": 0.0, "kl": 0.24108801782131195, "learning_rate": 1.2481047860804812e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12072 }, { "completion_length": 522.0, "epoch": 3.3461751662971175, "grad_norm": 0.0, "kl": 0.18918351829051971, "learning_rate": 1.2477258584213433e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12073 }, { "completion_length": 617.75, "epoch": 3.3464523281596454, "grad_norm": 0.0, "kl": 0.1504707634449005, "learning_rate": 1.2473469691642462e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12074 }, { "completion_length": 480.75, "epoch": 3.346729490022173, "grad_norm": 0.765021562576294, "kl": 2.065719427873112e+18, "learning_rate": 1.2469681183208043e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12075 }, { "completion_length": 550.0, "epoch": 3.347006651884701, "grad_norm": 0.0, "kl": 0.22086457908153534, "learning_rate": 1.246589305902639e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12076 }, { "completion_length": 504.5, "epoch": 3.3472838137472283, "grad_norm": 0.4572578966617584, "kl": 0.2167794108390808, "learning_rate": 1.2462105319213643e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12077 }, { "completion_length": 555.25, "epoch": 3.347560975609756, "grad_norm": 0.0, "kl": 0.1928228884935379, "learning_rate": 1.2458317963885983e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12078 }, { "completion_length": 569.0, "epoch": 3.3478381374722836, "grad_norm": 0.0, "kl": 0.18973395228385925, "learning_rate": 1.2454530993159539e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12079 }, { "completion_length": 641.25, "epoch": 3.3481152993348116, "grad_norm": 0.0, "kl": 0.17430105805397034, "learning_rate": 1.2450744407150427e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12080 }, { "completion_length": 562.0, "epoch": 3.3483924611973395, "grad_norm": 0.0, "kl": 0.2146977186203003, "learning_rate": 1.2446958205974797e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12081 }, { "completion_length": 524.75, "epoch": 3.348669623059867, "grad_norm": 0.0, "kl": 0.2453615814447403, "learning_rate": 1.2443172389748735e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12082 }, { "completion_length": 595.0, "epoch": 3.348946784922395, "grad_norm": 6.5049591064453125, "kl": 2.386327853072384e+16, "learning_rate": 1.243938695858834e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12083 }, { "completion_length": 551.25, "epoch": 3.3492239467849223, "grad_norm": 0.0, "kl": 0.18316932022571564, "learning_rate": 1.2435601912609691e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12084 }, { "completion_length": 509.5, "epoch": 3.34950110864745, "grad_norm": 0.0, "kl": 0.23247304558753967, "learning_rate": 1.243181725192886e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12085 }, { "completion_length": 519.0, "epoch": 3.3497782705099777, "grad_norm": 0.0, "kl": 0.19662654399871826, "learning_rate": 1.2428032976661925e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12086 }, { "completion_length": 567.25, "epoch": 3.3500554323725056, "grad_norm": 0.0, "kl": 0.18860965967178345, "learning_rate": 1.2424249086924917e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12087 }, { "completion_length": 569.0, "epoch": 3.3503325942350335, "grad_norm": 0.3881140351295471, "kl": 7.869476299364893e+18, "learning_rate": 1.2420465582833876e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12088 }, { "completion_length": 522.75, "epoch": 3.350609756097561, "grad_norm": 0.0, "kl": 0.23257005214691162, "learning_rate": 1.241668246450482e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12089 }, { "completion_length": 506.0, "epoch": 3.350886917960089, "grad_norm": 0.0, "kl": 0.20749163627624512, "learning_rate": 1.2412899732053774e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12090 }, { "completion_length": 529.0, "epoch": 3.3511640798226163, "grad_norm": 0.0, "kl": 0.23634980618953705, "learning_rate": 1.2409117385596733e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12091 }, { "completion_length": 650.5, "epoch": 3.3514412416851442, "grad_norm": 0.0, "kl": 0.9086363315582275, "learning_rate": 1.2405335425249677e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12092 }, { "completion_length": 493.25, "epoch": 3.3517184035476717, "grad_norm": 0.0, "kl": 0.20591600239276886, "learning_rate": 1.2401553851128598e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12093 }, { "completion_length": 529.25, "epoch": 3.3519955654101996, "grad_norm": 0.0, "kl": 0.5493184924125671, "learning_rate": 1.2397772663349444e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12094 }, { "completion_length": 609.0, "epoch": 3.3522727272727275, "grad_norm": 0.0, "kl": 0.17907215654850006, "learning_rate": 1.2393991862028199e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12095 }, { "completion_length": 561.75, "epoch": 3.352549889135255, "grad_norm": 0.0, "kl": 0.19355714321136475, "learning_rate": 1.2390211447280764e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12096 }, { "completion_length": 526.25, "epoch": 3.352827050997783, "grad_norm": 0.0, "kl": 0.191110298037529, "learning_rate": 1.23864314192231e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12097 }, { "completion_length": 530.0, "epoch": 3.3531042128603104, "grad_norm": 0.0, "kl": 0.17913123965263367, "learning_rate": 1.2382651777971102e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12098 }, { "completion_length": 511.5, "epoch": 3.3533813747228383, "grad_norm": 0.0, "kl": 0.25510647892951965, "learning_rate": 1.2378872523640698e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12099 }, { "completion_length": 558.25, "epoch": 3.3536585365853657, "grad_norm": 0.0, "kl": 0.17184531688690186, "learning_rate": 1.237509365634777e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12100 }, { "completion_length": 479.25, "epoch": 3.3539356984478936, "grad_norm": 0.0, "kl": 0.22438879311084747, "learning_rate": 1.2371315176208193e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12101 }, { "completion_length": 616.25, "epoch": 3.354212860310421, "grad_norm": 0.0, "kl": 0.2086181491613388, "learning_rate": 1.2367537083337856e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12102 }, { "completion_length": 539.25, "epoch": 3.354490022172949, "grad_norm": 0.0, "kl": 0.2008363902568817, "learning_rate": 1.2363759377852603e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12103 }, { "completion_length": 530.5, "epoch": 3.354767184035477, "grad_norm": 0.6152424216270447, "kl": 4.104421656028512e+18, "learning_rate": 1.2359982059868286e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12104 }, { "completion_length": 500.75, "epoch": 3.3550443458980044, "grad_norm": 0.0, "kl": 454869760.0, "learning_rate": 1.2356205129500728e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12105 }, { "completion_length": 503.0, "epoch": 3.3553215077605323, "grad_norm": 0.48983028531074524, "kl": 3.0123608687981363e+18, "learning_rate": 1.2352428586865775e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12106 }, { "completion_length": 472.0, "epoch": 3.3555986696230597, "grad_norm": 0.0, "kl": 1.5815200706459075e+18, "learning_rate": 1.234865243207921e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12107 }, { "completion_length": 486.0, "epoch": 3.3558758314855877, "grad_norm": 0.0, "kl": 0.24923592805862427, "learning_rate": 1.2344876665256862e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12108 }, { "completion_length": 612.75, "epoch": 3.356152993348115, "grad_norm": 0.0, "kl": 0.17438003420829773, "learning_rate": 1.2341101286514497e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12109 }, { "completion_length": 570.75, "epoch": 3.356430155210643, "grad_norm": 0.0, "kl": 0.22222942113876343, "learning_rate": 1.233732629596789e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12110 }, { "completion_length": 634.5, "epoch": 3.3567073170731705, "grad_norm": 0.0, "kl": 0.17157965898513794, "learning_rate": 1.2333551693732817e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12111 }, { "completion_length": 500.75, "epoch": 3.3569844789356984, "grad_norm": 0.0, "kl": 0.21598732471466064, "learning_rate": 1.2329777479925023e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12112 }, { "completion_length": 791.0, "epoch": 3.3572616407982263, "grad_norm": 0.0, "kl": 0.17518842220306396, "learning_rate": 1.2326003654660248e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12113 }, { "completion_length": 546.5, "epoch": 3.3575388026607538, "grad_norm": 0.0, "kl": 0.24458780884742737, "learning_rate": 1.232223021805421e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12114 }, { "completion_length": 559.5, "epoch": 3.3578159645232817, "grad_norm": 0.0, "kl": 0.21940523386001587, "learning_rate": 1.2318457170222631e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12115 }, { "completion_length": 479.75, "epoch": 3.358093126385809, "grad_norm": 0.0, "kl": 0.20354211330413818, "learning_rate": 1.2314684511281224e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12116 }, { "completion_length": 618.5, "epoch": 3.358370288248337, "grad_norm": 0.0, "kl": 0.16551974415779114, "learning_rate": 1.2310912241345676e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12117 }, { "completion_length": 507.25, "epoch": 3.3586474501108645, "grad_norm": 9.560382843017578, "kl": 656318650122240.0, "learning_rate": 1.2307140360531666e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12118 }, { "completion_length": 558.75, "epoch": 3.3589246119733924, "grad_norm": 0.0, "kl": 0.21699939668178558, "learning_rate": 1.2303368868954848e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12119 }, { "completion_length": 554.75, "epoch": 3.3592017738359203, "grad_norm": 0.0, "kl": 0.19876441359519958, "learning_rate": 1.2299597766730904e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12120 }, { "completion_length": 560.5, "epoch": 3.359478935698448, "grad_norm": 0.0, "kl": 0.2015865594148636, "learning_rate": 1.229582705397546e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12121 }, { "completion_length": 531.5, "epoch": 3.3597560975609757, "grad_norm": 0.0, "kl": 1.4586506083146138e+19, "learning_rate": 1.229205673080414e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12122 }, { "completion_length": 720.75, "epoch": 3.360033259423503, "grad_norm": 0.40832430124282837, "kl": 2.5046901269016347e+19, "learning_rate": 1.228828679733259e-06, "loss": 0.0, "reward": 4.5625, "reward_std": 1.9080421924591064, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.5625, "step": 12123 }, { "completion_length": 469.0, "epoch": 3.360310421286031, "grad_norm": 0.0, "kl": 0.21088044345378876, "learning_rate": 1.2284517253676395e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12124 }, { "completion_length": 576.0, "epoch": 3.3605875831485585, "grad_norm": 0.0, "kl": 0.19078770279884338, "learning_rate": 1.228074809995118e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12125 }, { "completion_length": 565.5, "epoch": 3.3608647450110865, "grad_norm": 0.0, "kl": 2.877961350294379e+19, "learning_rate": 1.227697933627249e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12126 }, { "completion_length": 491.0, "epoch": 3.3611419068736144, "grad_norm": 0.0, "kl": 0.2335207164287567, "learning_rate": 1.2273210962755927e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12127 }, { "completion_length": 565.25, "epoch": 3.361419068736142, "grad_norm": 0.0, "kl": 0.23865608870983124, "learning_rate": 1.2269442979517033e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12128 }, { "completion_length": 593.5, "epoch": 3.3616962305986697, "grad_norm": 0.0, "kl": 9.548127174377441, "learning_rate": 1.2265675386671371e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12129 }, { "completion_length": 518.75, "epoch": 3.361973392461197, "grad_norm": 0.42902806401252747, "kl": 0.18807454407215118, "learning_rate": 1.2261908184334475e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12130 }, { "completion_length": 533.25, "epoch": 3.362250554323725, "grad_norm": 0.0, "kl": 0.19032259285449982, "learning_rate": 1.2258141372621854e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12131 }, { "completion_length": 485.75, "epoch": 3.3625277161862526, "grad_norm": 0.0, "kl": 0.2316848635673523, "learning_rate": 1.2254374951649043e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12132 }, { "completion_length": 585.75, "epoch": 3.3628048780487805, "grad_norm": 0.0, "kl": 0.1796722263097763, "learning_rate": 1.2250608921531532e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12133 }, { "completion_length": 517.75, "epoch": 3.3630820399113084, "grad_norm": 0.5521834492683411, "kl": 5.79654129894671e+19, "learning_rate": 1.224684328238481e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12134 }, { "completion_length": 549.5, "epoch": 3.363359201773836, "grad_norm": 0.0, "kl": 0.17150647938251495, "learning_rate": 1.2243078034324341e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12135 }, { "completion_length": 614.75, "epoch": 3.3636363636363638, "grad_norm": 0.0, "kl": 0.19898775219917297, "learning_rate": 1.2239313177465611e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12136 }, { "completion_length": 541.0, "epoch": 3.363913525498891, "grad_norm": 0.0, "kl": 0.16481178998947144, "learning_rate": 1.2235548711924056e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12137 }, { "completion_length": 606.75, "epoch": 3.364190687361419, "grad_norm": 0.0, "kl": 0.16813987493515015, "learning_rate": 1.2231784637815132e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12138 }, { "completion_length": 622.75, "epoch": 3.3644678492239466, "grad_norm": 2.6466994285583496, "kl": 2571.24951171875, "learning_rate": 1.2228020955254256e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12139 }, { "completion_length": 463.75, "epoch": 3.3647450110864745, "grad_norm": 0.0, "kl": 0.2353878766298294, "learning_rate": 1.222425766435684e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12140 }, { "completion_length": 497.5, "epoch": 3.3650221729490024, "grad_norm": 0.0, "kl": 0.23535385727882385, "learning_rate": 1.2220494765238302e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12141 }, { "completion_length": 490.0, "epoch": 3.36529933481153, "grad_norm": 0.0, "kl": 0.20630909502506256, "learning_rate": 1.2216732258014033e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12142 }, { "completion_length": 550.0, "epoch": 3.365576496674058, "grad_norm": 0.0, "kl": 0.17671383917331696, "learning_rate": 1.2212970142799404e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12143 }, { "completion_length": 563.0, "epoch": 3.3658536585365852, "grad_norm": 0.0, "kl": 0.18442419171333313, "learning_rate": 1.2209208419709782e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12144 }, { "completion_length": 581.5, "epoch": 3.366130820399113, "grad_norm": 0.0, "kl": 0.19191132485866547, "learning_rate": 1.2205447088860528e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12145 }, { "completion_length": 544.75, "epoch": 3.3664079822616406, "grad_norm": 0.0, "kl": 0.18863101303577423, "learning_rate": 1.2201686150367006e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12146 }, { "completion_length": 477.25, "epoch": 3.3666851441241685, "grad_norm": 0.0, "kl": 0.22418636083602905, "learning_rate": 1.2197925604344513e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12147 }, { "completion_length": 528.5, "epoch": 3.3669623059866964, "grad_norm": 0.5879452228546143, "kl": 0.19158942997455597, "learning_rate": 1.2194165450908393e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12148 }, { "completion_length": 558.5, "epoch": 3.367239467849224, "grad_norm": 0.0, "kl": 0.20479454100131989, "learning_rate": 1.219040569017394e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12149 }, { "completion_length": 526.0, "epoch": 3.367516629711752, "grad_norm": 0.42662447690963745, "kl": 0.27326321601867676, "learning_rate": 1.2186646322256467e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12150 }, { "completion_length": 562.25, "epoch": 3.3677937915742793, "grad_norm": 0.0, "kl": 0.21507588028907776, "learning_rate": 1.218288734727125e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12151 }, { "completion_length": 508.0, "epoch": 3.368070953436807, "grad_norm": 0.0, "kl": 0.27319401502609253, "learning_rate": 1.217912876533355e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12152 }, { "completion_length": 442.5, "epoch": 3.3683481152993346, "grad_norm": 8.914584159851074, "kl": 90557674160128.0, "learning_rate": 1.2175370576558649e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12153 }, { "completion_length": 602.25, "epoch": 3.3686252771618626, "grad_norm": 0.0, "kl": 0.1676427125930786, "learning_rate": 1.217161278106177e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12154 }, { "completion_length": 577.25, "epoch": 3.3689024390243905, "grad_norm": 0.0, "kl": 0.261688232421875, "learning_rate": 1.2167855378958181e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12155 }, { "completion_length": 513.75, "epoch": 3.369179600886918, "grad_norm": 0.775585949420929, "kl": 4.207722972481323e+18, "learning_rate": 1.2164098370363073e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12156 }, { "completion_length": 462.5, "epoch": 3.369456762749446, "grad_norm": 0.0, "kl": 0.2032087743282318, "learning_rate": 1.2160341755391677e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12157 }, { "completion_length": 566.5, "epoch": 3.3697339246119733, "grad_norm": 0.0, "kl": 0.17094656825065613, "learning_rate": 1.215658553415918e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12158 }, { "completion_length": 449.25, "epoch": 3.370011086474501, "grad_norm": 0.0, "kl": 0.23278389871120453, "learning_rate": 1.2152829706780786e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12159 }, { "completion_length": 561.0, "epoch": 3.3702882483370287, "grad_norm": 0.0, "kl": 0.17510400712490082, "learning_rate": 1.2149074273371663e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12160 }, { "completion_length": 587.5, "epoch": 3.3705654101995566, "grad_norm": 0.0, "kl": 0.18280941247940063, "learning_rate": 1.2145319234046963e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12161 }, { "completion_length": 545.0, "epoch": 3.3708425720620845, "grad_norm": 0.0, "kl": 0.4347224533557892, "learning_rate": 1.214156458892186e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12162 }, { "completion_length": 628.75, "epoch": 3.371119733924612, "grad_norm": 0.0, "kl": 0.1831922084093094, "learning_rate": 1.2137810338111478e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12163 }, { "completion_length": 543.0, "epoch": 3.37139689578714, "grad_norm": 0.0, "kl": 0.1837858259677887, "learning_rate": 1.2134056481730948e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12164 }, { "completion_length": 874.5, "epoch": 3.3716740576496673, "grad_norm": 0.43041473627090454, "kl": 0.20002537965774536, "learning_rate": 1.2130303019895375e-06, "loss": 0.0, "reward": 4.0625, "reward_std": 3.375, "rewards/confident_score_func": 1.25, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.375, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.5625, "step": 12165 }, { "completion_length": 578.5, "epoch": 3.3719512195121952, "grad_norm": 0.42118096351623535, "kl": 2.789261328356108e+19, "learning_rate": 1.2126549952719886e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12166 }, { "completion_length": 494.5, "epoch": 3.3722283813747227, "grad_norm": 0.0, "kl": 0.22087475657463074, "learning_rate": 1.2122797280319543e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12167 }, { "completion_length": 501.25, "epoch": 3.3725055432372506, "grad_norm": 0.0, "kl": 0.5085064172744751, "learning_rate": 1.211904500280945e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12168 }, { "completion_length": 609.25, "epoch": 3.3727827050997785, "grad_norm": 0.0, "kl": 0.17801021039485931, "learning_rate": 1.2115293120304663e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12169 }, { "completion_length": 493.5, "epoch": 3.373059866962306, "grad_norm": 0.0, "kl": 0.1840055137872696, "learning_rate": 1.2111541632920229e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12170 }, { "completion_length": 595.25, "epoch": 3.373337028824834, "grad_norm": 0.0, "kl": 0.2053312510251999, "learning_rate": 1.2107790540771208e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12171 }, { "completion_length": 565.25, "epoch": 3.3736141906873613, "grad_norm": 0.0, "kl": 0.1938220113515854, "learning_rate": 1.210403984397262e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12172 }, { "completion_length": 584.5, "epoch": 3.3738913525498893, "grad_norm": 0.0, "kl": 0.25008320808410645, "learning_rate": 1.2100289542639484e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12173 }, { "completion_length": 662.25, "epoch": 3.3741685144124167, "grad_norm": 0.0, "kl": 0.18505392968654633, "learning_rate": 1.2096539636886797e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12174 }, { "completion_length": 547.5, "epoch": 3.3744456762749446, "grad_norm": 5.037919044494629, "kl": 609550415691776.0, "learning_rate": 1.2092790126829562e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12175 }, { "completion_length": 555.0, "epoch": 3.374722838137472, "grad_norm": 0.0, "kl": 0.192377507686615, "learning_rate": 1.208904101258278e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12176 }, { "completion_length": 595.25, "epoch": 3.375, "grad_norm": 0.0, "kl": 0.5335884690284729, "learning_rate": 1.2085292294261382e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12177 }, { "completion_length": 504.25, "epoch": 3.375277161862528, "grad_norm": 0.0, "kl": 0.1745365709066391, "learning_rate": 1.2081543971980359e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12178 }, { "completion_length": 587.0, "epoch": 3.3755543237250554, "grad_norm": 0.0, "kl": 0.18610860407352448, "learning_rate": 1.207779604585463e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12179 }, { "completion_length": 566.25, "epoch": 3.3758314855875833, "grad_norm": 0.0, "kl": 0.20585833489894867, "learning_rate": 1.207404851599915e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12180 }, { "completion_length": 425.25, "epoch": 3.3761086474501107, "grad_norm": 1.198941707611084, "kl": 4.889311055173386e+18, "learning_rate": 1.2070301382528835e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12181 }, { "completion_length": 549.25, "epoch": 3.3763858093126387, "grad_norm": 0.0, "kl": 0.28096699714660645, "learning_rate": 1.2066554645558578e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12182 }, { "completion_length": 528.75, "epoch": 3.376662971175166, "grad_norm": 0.0, "kl": 0.42918661236763, "learning_rate": 1.20628083052033e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12183 }, { "completion_length": 549.75, "epoch": 3.376940133037694, "grad_norm": 0.0, "kl": 0.21699024736881256, "learning_rate": 1.2059062361577871e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12184 }, { "completion_length": 574.25, "epoch": 3.3772172949002215, "grad_norm": 0.0, "kl": 0.21984606981277466, "learning_rate": 1.2055316814797169e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12185 }, { "completion_length": 540.0, "epoch": 3.3774944567627494, "grad_norm": 0.0, "kl": 0.22500893473625183, "learning_rate": 1.205157166497604e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12186 }, { "completion_length": 511.25, "epoch": 3.3777716186252773, "grad_norm": 0.0, "kl": 0.22233518958091736, "learning_rate": 1.2047826912229354e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12187 }, { "completion_length": 486.25, "epoch": 3.3780487804878048, "grad_norm": 0.0, "kl": 0.4628068208694458, "learning_rate": 1.2044082556671929e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12188 }, { "completion_length": 598.0, "epoch": 3.3783259423503327, "grad_norm": 0.7383329272270203, "kl": 955551040.0, "learning_rate": 1.2040338598418605e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12189 }, { "completion_length": 537.75, "epoch": 3.37860310421286, "grad_norm": 0.0, "kl": 1.6557048073667215e+18, "learning_rate": 1.2036595037584187e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12190 }, { "completion_length": 584.75, "epoch": 3.378880266075388, "grad_norm": 0.42994022369384766, "kl": 153073696768.0, "learning_rate": 1.203285187428346e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12191 }, { "completion_length": 513.5, "epoch": 3.3791574279379155, "grad_norm": 0.0, "kl": 0.1951417773962021, "learning_rate": 1.2029109108631237e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12192 }, { "completion_length": 510.25, "epoch": 3.3794345898004434, "grad_norm": 0.0, "kl": 0.23518195748329163, "learning_rate": 1.2025366740742278e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12193 }, { "completion_length": 517.25, "epoch": 3.3797117516629713, "grad_norm": 0.0, "kl": 0.21779027581214905, "learning_rate": 1.2021624770731346e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12194 }, { "completion_length": 507.25, "epoch": 3.379988913525499, "grad_norm": 0.0, "kl": 0.19184015691280365, "learning_rate": 1.2017883198713188e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12195 }, { "completion_length": 620.5, "epoch": 3.3802660753880267, "grad_norm": 0.0, "kl": 0.17185324430465698, "learning_rate": 1.2014142024802555e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12196 }, { "completion_length": 442.0, "epoch": 3.380543237250554, "grad_norm": 0.0, "kl": 0.2318214476108551, "learning_rate": 1.2010401249114166e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12197 }, { "completion_length": 496.25, "epoch": 3.380820399113082, "grad_norm": 0.0, "kl": 0.2181038111448288, "learning_rate": 1.200666087176273e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12198 }, { "completion_length": 524.75, "epoch": 3.3810975609756095, "grad_norm": 0.38429754972457886, "kl": 8.757763167254426e+19, "learning_rate": 1.200292089286296e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12199 }, { "completion_length": 585.75, "epoch": 3.3813747228381374, "grad_norm": 1.1407577991485596, "kl": 2517801984.0, "learning_rate": 1.1999181312529532e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12200 }, { "completion_length": 509.5, "epoch": 3.3816518847006654, "grad_norm": 0.0, "kl": 0.20613056421279907, "learning_rate": 1.1995442130877137e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12201 }, { "completion_length": 492.5, "epoch": 3.381929046563193, "grad_norm": 0.0, "kl": 0.24713511765003204, "learning_rate": 1.1991703348020437e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12202 }, { "completion_length": 457.5, "epoch": 3.3822062084257207, "grad_norm": 0.0, "kl": 0.1890283077955246, "learning_rate": 1.198796496407408e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12203 }, { "completion_length": 523.0, "epoch": 3.382483370288248, "grad_norm": 26.099952697753906, "kl": 157005696.0, "learning_rate": 1.1984226979152702e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12204 }, { "completion_length": 568.25, "epoch": 3.382760532150776, "grad_norm": 0.0, "kl": 0.3659871518611908, "learning_rate": 1.1980489393370939e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12205 }, { "completion_length": 536.25, "epoch": 3.3830376940133036, "grad_norm": 0.0, "kl": 0.4107375144958496, "learning_rate": 1.1976752206843424e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12206 }, { "completion_length": 570.0, "epoch": 3.3833148558758315, "grad_norm": 0.0, "kl": 0.17201490700244904, "learning_rate": 1.1973015419684724e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12207 }, { "completion_length": 617.5, "epoch": 3.3835920177383594, "grad_norm": 0.0, "kl": 0.19142119586467743, "learning_rate": 1.1969279032009459e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12208 }, { "completion_length": 585.0, "epoch": 3.383869179600887, "grad_norm": 0.0, "kl": 0.18893224000930786, "learning_rate": 1.1965543043932193e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12209 }, { "completion_length": 534.5, "epoch": 3.3841463414634148, "grad_norm": 0.0, "kl": 0.2751559019088745, "learning_rate": 1.1961807455567505e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12210 }, { "completion_length": 552.5, "epoch": 3.384423503325942, "grad_norm": 0.0, "kl": 0.22648963332176208, "learning_rate": 1.195807226702995e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12211 }, { "completion_length": 546.0, "epoch": 3.38470066518847, "grad_norm": 0.0, "kl": 0.213863343000412, "learning_rate": 1.1954337478434053e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12212 }, { "completion_length": 468.75, "epoch": 3.3849778270509976, "grad_norm": 0.0, "kl": 0.2317948192358017, "learning_rate": 1.1950603089894366e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12213 }, { "completion_length": 576.25, "epoch": 3.3852549889135255, "grad_norm": 0.0, "kl": 0.165884867310524, "learning_rate": 1.19468691015254e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12214 }, { "completion_length": 558.0, "epoch": 3.3855321507760534, "grad_norm": 0.0, "kl": 0.19443155825138092, "learning_rate": 1.1943135513441657e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12215 }, { "completion_length": 636.0, "epoch": 3.385809312638581, "grad_norm": 0.0, "kl": 0.17814630270004272, "learning_rate": 1.1939402325757623e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12216 }, { "completion_length": 564.5, "epoch": 3.386086474501109, "grad_norm": 0.0, "kl": 0.19489479064941406, "learning_rate": 1.19356695385878e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12217 }, { "completion_length": 547.75, "epoch": 3.3863636363636362, "grad_norm": 0.0, "kl": 0.20061254501342773, "learning_rate": 1.1931937152046635e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12218 }, { "completion_length": 637.0, "epoch": 3.386640798226164, "grad_norm": 0.0, "kl": 0.19999372959136963, "learning_rate": 1.1928205166248606e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12219 }, { "completion_length": 484.75, "epoch": 3.3869179600886916, "grad_norm": 0.0, "kl": 0.22499676048755646, "learning_rate": 1.1924473581308146e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12220 }, { "completion_length": 581.0, "epoch": 3.3871951219512195, "grad_norm": 0.0, "kl": 0.18717914819717407, "learning_rate": 1.192074239733968e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12221 }, { "completion_length": 530.25, "epoch": 3.3874722838137474, "grad_norm": 0.0, "kl": 1.0992385090876604e+20, "learning_rate": 1.1917011614457647e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12222 }, { "completion_length": 556.0, "epoch": 3.387749445676275, "grad_norm": 0.0, "kl": 0.26562491059303284, "learning_rate": 1.1913281232776445e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12223 }, { "completion_length": 537.0, "epoch": 3.388026607538803, "grad_norm": 0.0, "kl": 0.18422827124595642, "learning_rate": 1.1909551252410464e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12224 }, { "completion_length": 486.25, "epoch": 3.3883037694013303, "grad_norm": 0.0, "kl": 0.2170262187719345, "learning_rate": 1.1905821673474083e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12225 }, { "completion_length": 556.5, "epoch": 3.388580931263858, "grad_norm": 0.0, "kl": 0.24193689227104187, "learning_rate": 1.190209249608169e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12226 }, { "completion_length": 572.75, "epoch": 3.3888580931263856, "grad_norm": 0.0, "kl": 0.18573932349681854, "learning_rate": 1.1898363720347635e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12227 }, { "completion_length": 561.75, "epoch": 3.3891352549889135, "grad_norm": 0.43412819504737854, "kl": 7.343834424741462e+18, "learning_rate": 1.1894635346386254e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12228 }, { "completion_length": 540.5, "epoch": 3.3894124168514415, "grad_norm": 0.0, "kl": 0.16099633276462555, "learning_rate": 1.1890907374311903e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12229 }, { "completion_length": 625.25, "epoch": 3.389689578713969, "grad_norm": 1.4925068616867065, "kl": 3.2793893622314435e+18, "learning_rate": 1.1887179804238876e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12230 }, { "completion_length": 569.75, "epoch": 3.389966740576497, "grad_norm": 0.0, "kl": 0.19274896383285522, "learning_rate": 1.1883452636281506e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12231 }, { "completion_length": 519.25, "epoch": 3.3902439024390243, "grad_norm": 0.0, "kl": 0.20058482885360718, "learning_rate": 1.1879725870554081e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12232 }, { "completion_length": 578.0, "epoch": 3.390521064301552, "grad_norm": 0.6583683490753174, "kl": 1.1087419839107144e+20, "learning_rate": 1.1875999507170889e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12233 }, { "completion_length": 533.75, "epoch": 3.3907982261640797, "grad_norm": 0.4599848687648773, "kl": 68488384512.0, "learning_rate": 1.1872273546246181e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12234 }, { "completion_length": 538.25, "epoch": 3.3910753880266076, "grad_norm": 0.0, "kl": 0.23331260681152344, "learning_rate": 1.1868547987894247e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12235 }, { "completion_length": 571.5, "epoch": 3.3913525498891355, "grad_norm": 0.0, "kl": 0.1866818070411682, "learning_rate": 1.1864822832229319e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12236 }, { "completion_length": 474.5, "epoch": 3.391629711751663, "grad_norm": 0.0, "kl": 0.1867736577987671, "learning_rate": 1.1861098079365627e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12237 }, { "completion_length": 627.0, "epoch": 3.391906873614191, "grad_norm": 0.3626020848751068, "kl": 0.13841331005096436, "learning_rate": 1.1857373729417407e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12238 }, { "completion_length": 526.75, "epoch": 3.3921840354767183, "grad_norm": 0.0, "kl": 0.3026205897331238, "learning_rate": 1.1853649782498852e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12239 }, { "completion_length": 519.5, "epoch": 3.3924611973392462, "grad_norm": 0.4520729184150696, "kl": 0.2020712047815323, "learning_rate": 1.1849926238724193e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12240 }, { "completion_length": 544.25, "epoch": 3.3927383592017737, "grad_norm": 0.0, "kl": 0.2089146375656128, "learning_rate": 1.1846203098207573e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12241 }, { "completion_length": 518.25, "epoch": 3.3930155210643016, "grad_norm": 0.0, "kl": 0.18279768526554108, "learning_rate": 1.1842480361063185e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12242 }, { "completion_length": 650.25, "epoch": 3.3932926829268295, "grad_norm": 0.3336736559867859, "kl": 3.3256818254177894e+19, "learning_rate": 1.1838758027405201e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12243 }, { "completion_length": 567.0, "epoch": 3.393569844789357, "grad_norm": 0.0, "kl": 0.19570264220237732, "learning_rate": 1.183503609734776e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12244 }, { "completion_length": 540.0, "epoch": 3.393847006651885, "grad_norm": 0.0, "kl": 0.20662595331668854, "learning_rate": 1.1831314571004996e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12245 }, { "completion_length": 559.75, "epoch": 3.3941241685144123, "grad_norm": 0.0, "kl": 4.7087359515693154e+19, "learning_rate": 1.1827593448491023e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12246 }, { "completion_length": 560.0, "epoch": 3.3944013303769403, "grad_norm": 0.0, "kl": 0.2092343419790268, "learning_rate": 1.1823872729919972e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12247 }, { "completion_length": 633.5, "epoch": 3.3946784922394677, "grad_norm": 0.0, "kl": 0.19272677600383759, "learning_rate": 1.1820152415405936e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12248 }, { "completion_length": 547.5, "epoch": 3.3949556541019956, "grad_norm": 0.4480910301208496, "kl": 4.512996933350772e+19, "learning_rate": 1.1816432505062985e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12249 }, { "completion_length": 480.5, "epoch": 3.395232815964523, "grad_norm": 0.0, "kl": 0.19959571957588196, "learning_rate": 1.1812712999005222e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12250 }, { "completion_length": 590.5, "epoch": 3.395509977827051, "grad_norm": 0.0, "kl": 0.18865662813186646, "learning_rate": 1.1808993897346679e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12251 }, { "completion_length": 505.0, "epoch": 3.395787139689579, "grad_norm": 0.0, "kl": 0.16743426024913788, "learning_rate": 1.1805275200201433e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12252 }, { "completion_length": 570.0, "epoch": 3.3960643015521064, "grad_norm": 0.0, "kl": 0.19483865797519684, "learning_rate": 1.1801556907683503e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12253 }, { "completion_length": 500.75, "epoch": 3.3963414634146343, "grad_norm": 0.0, "kl": 2.2425005435943604, "learning_rate": 1.1797839019906923e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12254 }, { "completion_length": 568.0, "epoch": 3.3966186252771617, "grad_norm": 0.0, "kl": 0.2036312073469162, "learning_rate": 1.1794121536985688e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12255 }, { "completion_length": 495.5, "epoch": 3.3968957871396896, "grad_norm": 0.0, "kl": 0.1811317503452301, "learning_rate": 1.1790404459033824e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12256 }, { "completion_length": 466.25, "epoch": 3.397172949002217, "grad_norm": 1.5743006467819214, "kl": 3.6625367838939415e+19, "learning_rate": 1.1786687786165302e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12257 }, { "completion_length": 565.75, "epoch": 3.397450110864745, "grad_norm": 0.0, "kl": 0.281831830739975, "learning_rate": 1.178297151849409e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12258 }, { "completion_length": 562.25, "epoch": 3.3977272727272725, "grad_norm": 0.4772768020629883, "kl": 3.0625445036351816e+18, "learning_rate": 1.1779255656134169e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12259 }, { "completion_length": 580.0, "epoch": 3.3980044345898004, "grad_norm": 13.52480411529541, "kl": 52598.48046875, "learning_rate": 1.1775540199199474e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12260 }, { "completion_length": 420.75, "epoch": 3.3982815964523283, "grad_norm": 0.0, "kl": 0.2200380116701126, "learning_rate": 1.1771825147803956e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12261 }, { "completion_length": 500.0, "epoch": 3.3985587583148558, "grad_norm": 0.0, "kl": 0.2034323811531067, "learning_rate": 1.1768110502061534e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12262 }, { "completion_length": 523.75, "epoch": 3.3988359201773837, "grad_norm": 0.0, "kl": 0.22717167437076569, "learning_rate": 1.1764396262086118e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12263 }, { "completion_length": 547.75, "epoch": 3.399113082039911, "grad_norm": 0.0, "kl": 0.19421660900115967, "learning_rate": 1.17606824279916e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12264 }, { "completion_length": 550.25, "epoch": 3.399390243902439, "grad_norm": 0.0, "kl": 0.19832660257816315, "learning_rate": 1.1756968999891887e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12265 }, { "completion_length": 497.5, "epoch": 3.3996674057649665, "grad_norm": 0.0, "kl": 0.1628730595111847, "learning_rate": 1.1753255977900846e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12266 }, { "completion_length": 459.25, "epoch": 3.3999445676274944, "grad_norm": 0.0, "kl": 0.22748859226703644, "learning_rate": 1.1749543362132328e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12267 }, { "completion_length": 518.0, "epoch": 3.4002217294900223, "grad_norm": 0.0, "kl": 0.19012698531150818, "learning_rate": 1.1745831152700208e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12268 }, { "completion_length": 548.0, "epoch": 3.40049889135255, "grad_norm": 0.0, "kl": 9076051968.0, "learning_rate": 1.1742119349718297e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12269 }, { "completion_length": 627.5, "epoch": 3.4007760532150777, "grad_norm": 0.0, "kl": 1.3787609180741304e+18, "learning_rate": 1.1738407953300457e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12270 }, { "completion_length": 512.0, "epoch": 3.401053215077605, "grad_norm": 0.0, "kl": 0.2529038190841675, "learning_rate": 1.1734696963560457e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12271 }, { "completion_length": 497.0, "epoch": 3.401330376940133, "grad_norm": 0.0, "kl": 0.24828284978866577, "learning_rate": 1.1730986380612119e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12272 }, { "completion_length": 615.25, "epoch": 3.4016075388026605, "grad_norm": 0.0, "kl": 0.18798086047172546, "learning_rate": 1.1727276204569241e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12273 }, { "completion_length": 568.0, "epoch": 3.4018847006651884, "grad_norm": 0.0, "kl": 0.190863698720932, "learning_rate": 1.1723566435545586e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12274 }, { "completion_length": 499.25, "epoch": 3.4021618625277164, "grad_norm": 0.0, "kl": 1121177088.0, "learning_rate": 1.1719857073654923e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12275 }, { "completion_length": 641.5, "epoch": 3.402439024390244, "grad_norm": 0.0, "kl": 0.18531957268714905, "learning_rate": 1.171614811901099e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12276 }, { "completion_length": 532.5, "epoch": 3.4027161862527717, "grad_norm": 0.0, "kl": 0.22240997850894928, "learning_rate": 1.171243957172754e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12277 }, { "completion_length": 518.0, "epoch": 3.402993348115299, "grad_norm": 0.42198604345321655, "kl": 0.21289579570293427, "learning_rate": 1.1708731431918298e-06, "loss": -0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12278 }, { "completion_length": 496.25, "epoch": 3.403270509977827, "grad_norm": 0.0, "kl": 0.20664462447166443, "learning_rate": 1.1705023699696963e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12279 }, { "completion_length": 545.75, "epoch": 3.4035476718403546, "grad_norm": 0.0, "kl": 0.20332303643226624, "learning_rate": 1.170131637517725e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12280 }, { "completion_length": 546.5, "epoch": 3.4038248337028825, "grad_norm": 0.0, "kl": 0.20291581749916077, "learning_rate": 1.169760945847284e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12281 }, { "completion_length": 481.5, "epoch": 3.4041019955654104, "grad_norm": 0.0, "kl": 0.23840560019016266, "learning_rate": 1.1693902949697417e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12282 }, { "completion_length": 513.75, "epoch": 3.404379157427938, "grad_norm": 0.0, "kl": 0.20296746492385864, "learning_rate": 1.1690196848964639e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12283 }, { "completion_length": 511.25, "epoch": 3.4046563192904657, "grad_norm": 0.0, "kl": 0.20010216534137726, "learning_rate": 1.1686491156388158e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12284 }, { "completion_length": 572.25, "epoch": 3.404933481152993, "grad_norm": 0.0, "kl": 0.1897364854812622, "learning_rate": 1.1682785872081599e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12285 }, { "completion_length": 527.0, "epoch": 3.405210643015521, "grad_norm": 0.0, "kl": 0.17689895629882812, "learning_rate": 1.1679080996158608e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12286 }, { "completion_length": 560.25, "epoch": 3.4054878048780486, "grad_norm": 0.0, "kl": 0.2211986631155014, "learning_rate": 1.167537652873279e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12287 }, { "completion_length": 530.5, "epoch": 3.4057649667405765, "grad_norm": 0.48442795872688293, "kl": 6.7845422166522724e+19, "learning_rate": 1.1671672469917735e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12288 }, { "completion_length": 562.0, "epoch": 3.4060421286031044, "grad_norm": 0.0, "kl": 0.24000602960586548, "learning_rate": 1.1667968819827052e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12289 }, { "completion_length": 578.75, "epoch": 3.406319290465632, "grad_norm": 0.0, "kl": 0.19172704219818115, "learning_rate": 1.1664265578574293e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12290 }, { "completion_length": 541.0, "epoch": 3.4065964523281598, "grad_norm": 0.6701456904411316, "kl": 1.0525763626042655e+18, "learning_rate": 1.1660562746273055e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12291 }, { "completion_length": 525.5, "epoch": 3.4068736141906872, "grad_norm": 0.0, "kl": 0.19658680260181427, "learning_rate": 1.1656860323036847e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12292 }, { "completion_length": 532.5, "epoch": 3.407150776053215, "grad_norm": 0.0, "kl": 0.21684762835502625, "learning_rate": 1.1653158308979237e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12293 }, { "completion_length": 526.25, "epoch": 3.4074279379157426, "grad_norm": 0.0, "kl": 0.18695689737796783, "learning_rate": 1.164945670421373e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12294 }, { "completion_length": 591.0, "epoch": 3.4077050997782705, "grad_norm": 0.0, "kl": 0.17997032403945923, "learning_rate": 1.1645755508853858e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12295 }, { "completion_length": 502.25, "epoch": 3.4079822616407984, "grad_norm": 0.0, "kl": 0.161879301071167, "learning_rate": 1.1642054723013114e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12296 }, { "completion_length": 499.25, "epoch": 3.408259423503326, "grad_norm": 0.515423595905304, "kl": 4.126957961033967e+19, "learning_rate": 1.1638354346804974e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12297 }, { "completion_length": 523.75, "epoch": 3.408536585365854, "grad_norm": 0.0, "kl": 0.18603572249412537, "learning_rate": 1.163465438034293e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12298 }, { "completion_length": 571.25, "epoch": 3.4088137472283813, "grad_norm": 0.453505277633667, "kl": 7454061056.0, "learning_rate": 1.163095482374043e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12299 }, { "completion_length": 508.25, "epoch": 3.409090909090909, "grad_norm": 0.0, "kl": 0.229561448097229, "learning_rate": 1.1627255677110952e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12300 }, { "completion_length": 608.75, "epoch": 3.4093680709534366, "grad_norm": 0.6938009262084961, "kl": 974416640.0, "learning_rate": 1.1623556940567892e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12301 }, { "completion_length": 631.0, "epoch": 3.4096452328159645, "grad_norm": 0.0, "kl": 0.163625106215477, "learning_rate": 1.1619858614224697e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12302 }, { "completion_length": 546.5, "epoch": 3.4099223946784925, "grad_norm": 0.0, "kl": 0.19042642414569855, "learning_rate": 1.161616069819479e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12303 }, { "completion_length": 562.25, "epoch": 3.41019955654102, "grad_norm": 0.0, "kl": 0.19579686224460602, "learning_rate": 1.1612463192591556e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12304 }, { "completion_length": 552.0, "epoch": 3.410476718403548, "grad_norm": 1.8832015991210938, "kl": 766790464.0, "learning_rate": 1.1608766097528384e-06, "loss": 0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12305 }, { "completion_length": 643.5, "epoch": 3.4107538802660753, "grad_norm": 0.0, "kl": 0.1611536741256714, "learning_rate": 1.1605069413118644e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12306 }, { "completion_length": 537.25, "epoch": 3.411031042128603, "grad_norm": 0.8739385008811951, "kl": 7.280764238748975e+18, "learning_rate": 1.1601373139475712e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12307 }, { "completion_length": 560.0, "epoch": 3.4113082039911307, "grad_norm": 0.0, "kl": 0.17631511390209198, "learning_rate": 1.1597677276712927e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12308 }, { "completion_length": 577.0, "epoch": 3.4115853658536586, "grad_norm": 0.0, "kl": 0.21944957971572876, "learning_rate": 1.1593981824943617e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12309 }, { "completion_length": 506.5, "epoch": 3.4118625277161865, "grad_norm": 0.0, "kl": 1173395456.0, "learning_rate": 1.159028678428113e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12310 }, { "completion_length": 624.0, "epoch": 3.412139689578714, "grad_norm": 0.0, "kl": 0.1799698770046234, "learning_rate": 1.1586592154838752e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12311 }, { "completion_length": 575.0, "epoch": 3.412416851441242, "grad_norm": 0.0, "kl": 0.22559472918510437, "learning_rate": 1.1582897936729805e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12312 }, { "completion_length": 590.75, "epoch": 3.4126940133037693, "grad_norm": 0.0, "kl": 0.204609677195549, "learning_rate": 1.1579204130067564e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12313 }, { "completion_length": 536.0, "epoch": 3.412971175166297, "grad_norm": 0.0, "kl": 0.20090226829051971, "learning_rate": 1.1575510734965305e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12314 }, { "completion_length": 518.0, "epoch": 3.4132483370288247, "grad_norm": 0.0, "kl": 0.17049631476402283, "learning_rate": 1.1571817751536276e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12315 }, { "completion_length": 455.0, "epoch": 3.4135254988913526, "grad_norm": 0.0, "kl": 0.22282350063323975, "learning_rate": 1.1568125179893746e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12316 }, { "completion_length": 591.75, "epoch": 3.4138026607538805, "grad_norm": 0.0, "kl": 0.17425772547721863, "learning_rate": 1.1564433020150946e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12317 }, { "completion_length": 514.25, "epoch": 3.414079822616408, "grad_norm": 0.0, "kl": 0.18713805079460144, "learning_rate": 1.1560741272421077e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12318 }, { "completion_length": 541.75, "epoch": 3.414356984478936, "grad_norm": 0.0, "kl": 0.2303178757429123, "learning_rate": 1.1557049936817383e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12319 }, { "completion_length": 545.75, "epoch": 3.4146341463414633, "grad_norm": 0.0, "kl": 0.1975700557231903, "learning_rate": 1.1553359013453036e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12320 }, { "completion_length": 539.75, "epoch": 3.4149113082039912, "grad_norm": 0.0, "kl": 0.19370679557323456, "learning_rate": 1.1549668502441246e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12321 }, { "completion_length": 476.5, "epoch": 3.4151884700665187, "grad_norm": 0.0, "kl": 0.1824314445257187, "learning_rate": 1.1545978403895159e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12322 }, { "completion_length": 593.25, "epoch": 3.4154656319290466, "grad_norm": 0.4542579650878906, "kl": 24276021248.0, "learning_rate": 1.1542288717927952e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12323 }, { "completion_length": 569.75, "epoch": 3.4157427937915745, "grad_norm": 0.0, "kl": 11974973849600.0, "learning_rate": 1.1538599444652757e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12324 }, { "completion_length": 585.0, "epoch": 3.416019955654102, "grad_norm": 0.0, "kl": 0.17230401933193207, "learning_rate": 1.1534910584182731e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12325 }, { "completion_length": 508.0, "epoch": 3.41629711751663, "grad_norm": 7.442309379577637, "kl": 2.012319171769008e+18, "learning_rate": 1.153122213663098e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12326 }, { "completion_length": 542.25, "epoch": 3.4165742793791574, "grad_norm": 0.0, "kl": 0.19676215946674347, "learning_rate": 1.1527534102110613e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12327 }, { "completion_length": 505.75, "epoch": 3.4168514412416853, "grad_norm": 0.0, "kl": 0.1908053606748581, "learning_rate": 1.1523846480734742e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12328 }, { "completion_length": 554.25, "epoch": 3.4171286031042127, "grad_norm": 0.0, "kl": 0.19510680437088013, "learning_rate": 1.1520159272616432e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12329 }, { "completion_length": 493.75, "epoch": 3.4174057649667406, "grad_norm": 16.52907371520996, "kl": 64467685376.0, "learning_rate": 1.1516472477868768e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12330 }, { "completion_length": 565.0, "epoch": 3.417682926829268, "grad_norm": 0.5085606575012207, "kl": 0.17984873056411743, "learning_rate": 1.1512786096604791e-06, "loss": -0.0, "reward": 5.59375, "reward_std": 0.3125, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.375, "rewards/xmlcount_reward_func": 0.71875, "step": 12331 }, { "completion_length": 573.75, "epoch": 3.417960088691796, "grad_norm": 0.0, "kl": 2.132165432390451e+16, "learning_rate": 1.150910012893756e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12332 }, { "completion_length": 622.0, "epoch": 3.4182372505543235, "grad_norm": 0.0, "kl": 0.1943286508321762, "learning_rate": 1.150541457498012e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12333 }, { "completion_length": 596.25, "epoch": 3.4185144124168514, "grad_norm": 0.0, "kl": 0.22688421607017517, "learning_rate": 1.1501729434845478e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12334 }, { "completion_length": 629.75, "epoch": 3.4187915742793793, "grad_norm": 0.0, "kl": 0.20045648515224457, "learning_rate": 1.1498044708646638e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12335 }, { "completion_length": 657.75, "epoch": 3.4190687361419068, "grad_norm": 0.0, "kl": 0.13908134400844574, "learning_rate": 1.1494360396496594e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12336 }, { "completion_length": 494.0, "epoch": 3.4193458980044347, "grad_norm": 0.0, "kl": 0.22008274495601654, "learning_rate": 1.1490676498508344e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12337 }, { "completion_length": 548.5, "epoch": 3.419623059866962, "grad_norm": 0.0, "kl": 0.21632423996925354, "learning_rate": 1.1486993014794848e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12338 }, { "completion_length": 560.5, "epoch": 3.41990022172949, "grad_norm": 0.0, "kl": 0.3445110619068146, "learning_rate": 1.1483309945469053e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12339 }, { "completion_length": 505.5, "epoch": 3.4201773835920175, "grad_norm": 0.0, "kl": 0.9283409714698792, "learning_rate": 1.1479627290643924e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12340 }, { "completion_length": 598.0, "epoch": 3.4204545454545454, "grad_norm": 0.0, "kl": 0.20723339915275574, "learning_rate": 1.1475945050432375e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12341 }, { "completion_length": 489.0, "epoch": 3.4207317073170733, "grad_norm": 0.0, "kl": 0.2693098187446594, "learning_rate": 1.147226322494735e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12342 }, { "completion_length": 507.5, "epoch": 3.421008869179601, "grad_norm": 0.0, "kl": 0.21123209595680237, "learning_rate": 1.1468581814301718e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12343 }, { "completion_length": 582.5, "epoch": 3.4212860310421287, "grad_norm": 0.0, "kl": 0.18505842983722687, "learning_rate": 1.1464900818608405e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12344 }, { "completion_length": 500.25, "epoch": 3.421563192904656, "grad_norm": 2.5216357707977295, "kl": 5.291267767276667e+17, "learning_rate": 1.1461220237980267e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12345 }, { "completion_length": 578.0, "epoch": 3.421840354767184, "grad_norm": 0.0, "kl": 0.17221033573150635, "learning_rate": 1.1457540072530194e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12346 }, { "completion_length": 538.25, "epoch": 3.4221175166297115, "grad_norm": 0.0, "kl": 127674744832.0, "learning_rate": 1.1453860322371032e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12347 }, { "completion_length": 489.5, "epoch": 3.4223946784922394, "grad_norm": 0.0, "kl": 0.17718097567558289, "learning_rate": 1.1450180987615614e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12348 }, { "completion_length": 504.0, "epoch": 3.4226718403547673, "grad_norm": 0.0, "kl": 0.22281129658222198, "learning_rate": 1.1446502068376788e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12349 }, { "completion_length": 660.5, "epoch": 3.422949002217295, "grad_norm": 0.0, "kl": 2.282522201538086, "learning_rate": 1.1442823564767355e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12350 }, { "completion_length": 484.5, "epoch": 3.4232261640798227, "grad_norm": 0.0, "kl": 0.21627956628799438, "learning_rate": 1.1439145476900145e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12351 }, { "completion_length": 632.0, "epoch": 3.42350332594235, "grad_norm": 0.0, "kl": 5.568956300819169e+17, "learning_rate": 1.1435467804887912e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12352 }, { "completion_length": 595.5, "epoch": 3.423780487804878, "grad_norm": 5.7654900550842285, "kl": 1011505758208.0, "learning_rate": 1.1431790548843464e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12353 }, { "completion_length": 543.75, "epoch": 3.4240576496674056, "grad_norm": 0.0, "kl": 0.21200129389762878, "learning_rate": 1.1428113708879548e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12354 }, { "completion_length": 498.75, "epoch": 3.4243348115299335, "grad_norm": 0.0, "kl": 0.23845677077770233, "learning_rate": 1.1424437285108935e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12355 }, { "completion_length": 507.25, "epoch": 3.4246119733924614, "grad_norm": 0.0, "kl": 0.18730244040489197, "learning_rate": 1.142076127764436e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12356 }, { "completion_length": 529.75, "epoch": 3.424889135254989, "grad_norm": 0.0, "kl": 0.2885911762714386, "learning_rate": 1.1417085686598539e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12357 }, { "completion_length": 494.5, "epoch": 3.4251662971175167, "grad_norm": 0.0, "kl": 0.23021386563777924, "learning_rate": 1.1413410512084206e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12358 }, { "completion_length": 556.5, "epoch": 3.425443458980044, "grad_norm": 0.0, "kl": 0.2087354212999344, "learning_rate": 1.1409735754214052e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12359 }, { "completion_length": 592.75, "epoch": 3.425720620842572, "grad_norm": 0.0, "kl": 2686210560.0, "learning_rate": 1.1406061413100768e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12360 }, { "completion_length": 574.25, "epoch": 3.4259977827050996, "grad_norm": 0.0, "kl": 0.18889793753623962, "learning_rate": 1.1402387488857022e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12361 }, { "completion_length": 663.25, "epoch": 3.4262749445676275, "grad_norm": 0.0, "kl": 1.010977864265442, "learning_rate": 1.139871398159549e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12362 }, { "completion_length": 550.0, "epoch": 3.4265521064301554, "grad_norm": 0.0, "kl": 0.19218020141124725, "learning_rate": 1.1395040891428828e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12363 }, { "completion_length": 533.5, "epoch": 3.426829268292683, "grad_norm": 0.0, "kl": 0.19762320816516876, "learning_rate": 1.1391368218469665e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12364 }, { "completion_length": 568.0, "epoch": 3.4271064301552108, "grad_norm": 0.0, "kl": 0.16677704453468323, "learning_rate": 1.1387695962830628e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12365 }, { "completion_length": 559.25, "epoch": 3.4273835920177382, "grad_norm": 0.5551066994667053, "kl": 0.20804187655448914, "learning_rate": 1.1384024124624324e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12366 }, { "completion_length": 596.75, "epoch": 3.427660753880266, "grad_norm": 4.774138450622559, "kl": 1644.5576171875, "learning_rate": 1.1380352703963364e-06, "loss": 0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12367 }, { "completion_length": 624.75, "epoch": 3.4279379157427936, "grad_norm": 0.3581521809101105, "kl": 0.1786721795797348, "learning_rate": 1.1376681700960333e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12368 }, { "completion_length": 570.5, "epoch": 3.4282150776053215, "grad_norm": 0.4160607159137726, "kl": 6.909302921446713e+19, "learning_rate": 1.1373011115727794e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12369 }, { "completion_length": 510.25, "epoch": 3.4284922394678494, "grad_norm": 0.0, "kl": 0.1854543536901474, "learning_rate": 1.1369340948378323e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12370 }, { "completion_length": 495.0, "epoch": 3.428769401330377, "grad_norm": 0.0, "kl": 0.24346861243247986, "learning_rate": 1.1365671199024457e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12371 }, { "completion_length": 543.25, "epoch": 3.429046563192905, "grad_norm": 0.4605807065963745, "kl": 3.3544545054004085e+19, "learning_rate": 1.1362001867778757e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12372 }, { "completion_length": 585.75, "epoch": 3.4293237250554323, "grad_norm": 0.0, "kl": 0.20330703258514404, "learning_rate": 1.1358332954753707e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12373 }, { "completion_length": 487.75, "epoch": 3.42960088691796, "grad_norm": 0.0, "kl": 0.21293970942497253, "learning_rate": 1.1354664460061846e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12374 }, { "completion_length": 447.0, "epoch": 3.4298780487804876, "grad_norm": 0.0, "kl": 0.23867282271385193, "learning_rate": 1.1350996383815654e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12375 }, { "completion_length": 594.25, "epoch": 3.4301552106430155, "grad_norm": 0.0, "kl": 0.17436785995960236, "learning_rate": 1.1347328726127635e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12376 }, { "completion_length": 556.5, "epoch": 3.4304323725055434, "grad_norm": 8.19745922088623, "kl": 9560.0927734375, "learning_rate": 1.134366148711025e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12377 }, { "completion_length": 578.25, "epoch": 3.430709534368071, "grad_norm": 0.0, "kl": 0.20300620794296265, "learning_rate": 1.1339994666875947e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12378 }, { "completion_length": 510.75, "epoch": 3.430986696230599, "grad_norm": 0.0, "kl": 0.39967095851898193, "learning_rate": 1.1336328265537195e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12379 }, { "completion_length": 552.0, "epoch": 3.4312638580931263, "grad_norm": 0.0, "kl": 0.2179611772298813, "learning_rate": 1.1332662283206415e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12380 }, { "completion_length": 479.5, "epoch": 3.431541019955654, "grad_norm": 0.0, "kl": 0.276716023683548, "learning_rate": 1.1328996719996024e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12381 }, { "completion_length": 528.25, "epoch": 3.4318181818181817, "grad_norm": 0.0, "kl": 2.136319619248423e+18, "learning_rate": 1.1325331576018428e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12382 }, { "completion_length": 502.75, "epoch": 3.4320953436807096, "grad_norm": 0.0, "kl": 0.18716681003570557, "learning_rate": 1.1321666851386029e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12383 }, { "completion_length": 575.75, "epoch": 3.4323725055432375, "grad_norm": 0.0, "kl": 0.17596575617790222, "learning_rate": 1.1318002546211216e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12384 }, { "completion_length": 529.5, "epoch": 3.432649667405765, "grad_norm": 0.0, "kl": 1.3058070846206116e+18, "learning_rate": 1.1314338660606348e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12385 }, { "completion_length": 463.25, "epoch": 3.432926829268293, "grad_norm": 0.0, "kl": 0.42038893699645996, "learning_rate": 1.131067519468378e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12386 }, { "completion_length": 525.0, "epoch": 3.4332039911308203, "grad_norm": 0.0, "kl": 0.17748023569583893, "learning_rate": 1.1307012148555852e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12387 }, { "completion_length": 552.0, "epoch": 3.433481152993348, "grad_norm": 0.0, "kl": 0.20611299574375153, "learning_rate": 1.1303349522334905e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12388 }, { "completion_length": 576.75, "epoch": 3.4337583148558757, "grad_norm": 5.854963779449463, "kl": 72315265024.0, "learning_rate": 1.1299687316133257e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12389 }, { "completion_length": 534.25, "epoch": 3.4340354767184036, "grad_norm": 0.0, "kl": 0.21596118807792664, "learning_rate": 1.1296025530063203e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12390 }, { "completion_length": 507.0, "epoch": 3.4343126385809315, "grad_norm": 0.0, "kl": 0.19321076571941376, "learning_rate": 1.1292364164237033e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12391 }, { "completion_length": 537.75, "epoch": 3.434589800443459, "grad_norm": 0.0, "kl": 0.17818492650985718, "learning_rate": 1.1288703218767027e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12392 }, { "completion_length": 527.0, "epoch": 3.434866962305987, "grad_norm": 0.39806950092315674, "kl": 42452381696.0, "learning_rate": 1.1285042693765477e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12393 }, { "completion_length": 562.0, "epoch": 3.4351441241685143, "grad_norm": 0.0, "kl": 0.1931980550289154, "learning_rate": 1.1281382589344598e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12394 }, { "completion_length": 595.75, "epoch": 3.4354212860310422, "grad_norm": 0.0, "kl": 0.16428391635417938, "learning_rate": 1.1277722905616653e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12395 }, { "completion_length": 600.25, "epoch": 3.4356984478935697, "grad_norm": 0.0, "kl": 250427312.0, "learning_rate": 1.1274063642693855e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12396 }, { "completion_length": 496.0, "epoch": 3.4359756097560976, "grad_norm": 0.0, "kl": 0.19716963171958923, "learning_rate": 1.127040480068844e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12397 }, { "completion_length": 511.75, "epoch": 3.4362527716186255, "grad_norm": 0.0, "kl": 0.1691228300333023, "learning_rate": 1.126674637971259e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12398 }, { "completion_length": 629.75, "epoch": 3.436529933481153, "grad_norm": 0.4490731358528137, "kl": 9560436736.0, "learning_rate": 1.1263088379878493e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12399 }, { "completion_length": 581.0, "epoch": 3.436807095343681, "grad_norm": 0.0, "kl": 0.17473942041397095, "learning_rate": 1.1259430801298341e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12400 }, { "completion_length": 556.75, "epoch": 3.4370842572062084, "grad_norm": 0.0, "kl": 0.19995085895061493, "learning_rate": 1.1255773644084276e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12401 }, { "completion_length": 590.25, "epoch": 3.4373614190687363, "grad_norm": 0.0, "kl": 29.51964569091797, "learning_rate": 1.1252116908348478e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12402 }, { "completion_length": 518.75, "epoch": 3.4376385809312637, "grad_norm": 0.0, "kl": 0.2225653976202011, "learning_rate": 1.1248460594203049e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12403 }, { "completion_length": 608.5, "epoch": 3.4379157427937916, "grad_norm": 0.0, "kl": 0.18999145925045013, "learning_rate": 1.1244804701760133e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12404 }, { "completion_length": 542.5, "epoch": 3.438192904656319, "grad_norm": 0.0, "kl": 0.31211572885513306, "learning_rate": 1.124114923113183e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12405 }, { "completion_length": 536.0, "epoch": 3.438470066518847, "grad_norm": 0.0, "kl": 0.19627811014652252, "learning_rate": 1.1237494182430252e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12406 }, { "completion_length": 545.5, "epoch": 3.4387472283813745, "grad_norm": 0.0, "kl": 0.20313575863838196, "learning_rate": 1.1233839555767482e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12407 }, { "completion_length": 464.75, "epoch": 3.4390243902439024, "grad_norm": 0.0, "kl": 0.19355715811252594, "learning_rate": 1.1230185351255573e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12408 }, { "completion_length": 510.75, "epoch": 3.4393015521064303, "grad_norm": 0.0, "kl": 0.2257712334394455, "learning_rate": 1.1226531569006607e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12409 }, { "completion_length": 576.0, "epoch": 3.4395787139689578, "grad_norm": 0.0, "kl": 0.18464413285255432, "learning_rate": 1.1222878209132624e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12410 }, { "completion_length": 637.5, "epoch": 3.4398558758314857, "grad_norm": 0.0, "kl": 0.14751006662845612, "learning_rate": 1.1219225271745653e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12411 }, { "completion_length": 586.75, "epoch": 3.440133037694013, "grad_norm": 0.0, "kl": 1.5584490181602836e+18, "learning_rate": 1.121557275695771e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12412 }, { "completion_length": 610.75, "epoch": 3.440410199556541, "grad_norm": 0.0, "kl": 0.2070726901292801, "learning_rate": 1.1211920664880807e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12413 }, { "completion_length": 526.5, "epoch": 3.4406873614190685, "grad_norm": 2.101452350616455, "kl": 1507912843264.0, "learning_rate": 1.120826899562695e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12414 }, { "completion_length": 426.5, "epoch": 3.4409645232815964, "grad_norm": 0.0, "kl": 51.92955780029297, "learning_rate": 1.1204617749308108e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12415 }, { "completion_length": 492.75, "epoch": 3.4412416851441243, "grad_norm": 0.0, "kl": 3.344397008658351e+20, "learning_rate": 1.1200966926036255e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12416 }, { "completion_length": 561.5, "epoch": 3.441518847006652, "grad_norm": 0.0, "kl": 0.21646705269813538, "learning_rate": 1.1197316525923334e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12417 }, { "completion_length": 535.75, "epoch": 3.4417960088691797, "grad_norm": 0.0, "kl": 0.17828990519046783, "learning_rate": 1.119366654908131e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12418 }, { "completion_length": 575.5, "epoch": 3.442073170731707, "grad_norm": 0.0, "kl": 0.19772329926490784, "learning_rate": 1.1190016995622094e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12419 }, { "completion_length": 522.5, "epoch": 3.442350332594235, "grad_norm": 0.0, "kl": 211305988096.0, "learning_rate": 1.118636786565761e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12420 }, { "completion_length": 464.5, "epoch": 3.4426274944567625, "grad_norm": 0.0, "kl": 0.20860536396503448, "learning_rate": 1.1182719159299755e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12421 }, { "completion_length": 488.25, "epoch": 3.4429046563192904, "grad_norm": 0.0, "kl": 0.20308686792850494, "learning_rate": 1.1179070876660422e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12422 }, { "completion_length": 511.75, "epoch": 3.4431818181818183, "grad_norm": 0.0, "kl": 0.23700031638145447, "learning_rate": 1.1175423017851508e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12423 }, { "completion_length": 641.25, "epoch": 3.443458980044346, "grad_norm": 0.0, "kl": 1.809706975798231e+17, "learning_rate": 1.1171775582984849e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12424 }, { "completion_length": 560.0, "epoch": 3.4437361419068737, "grad_norm": 0.0, "kl": 0.18013961613178253, "learning_rate": 1.1168128572172313e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12425 }, { "completion_length": 572.0, "epoch": 3.444013303769401, "grad_norm": 0.0, "kl": 0.18068674206733704, "learning_rate": 1.1164481985525725e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12426 }, { "completion_length": 579.25, "epoch": 3.444290465631929, "grad_norm": 0.0, "kl": 0.1925843507051468, "learning_rate": 1.1160835823156931e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12427 }, { "completion_length": 537.5, "epoch": 3.4445676274944566, "grad_norm": 0.0, "kl": 0.22539809346199036, "learning_rate": 1.1157190085177733e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12428 }, { "completion_length": 546.75, "epoch": 3.4448447893569845, "grad_norm": 0.0, "kl": 0.21064305305480957, "learning_rate": 1.115354477169992e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12429 }, { "completion_length": 508.0, "epoch": 3.4451219512195124, "grad_norm": 0.0, "kl": 0.2855033278465271, "learning_rate": 1.11498998828353e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12430 }, { "completion_length": 568.75, "epoch": 3.44539911308204, "grad_norm": 0.0, "kl": 0.19703640043735504, "learning_rate": 1.1146255418695635e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12431 }, { "completion_length": 568.25, "epoch": 3.4456762749445677, "grad_norm": 0.4003332853317261, "kl": 16273937408.0, "learning_rate": 1.1142611379392687e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12432 }, { "completion_length": 507.75, "epoch": 3.445953436807095, "grad_norm": 0.0, "kl": 0.23327769339084625, "learning_rate": 1.1138967765038192e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12433 }, { "completion_length": 439.5, "epoch": 3.446230598669623, "grad_norm": 0.0, "kl": 0.21685753762722015, "learning_rate": 1.1135324575743905e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12434 }, { "completion_length": 571.5, "epoch": 3.4465077605321506, "grad_norm": 0.0, "kl": 0.17762324213981628, "learning_rate": 1.1131681811621529e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12435 }, { "completion_length": 472.0, "epoch": 3.4467849223946785, "grad_norm": 0.0, "kl": 0.2448539286851883, "learning_rate": 1.1128039472782789e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12436 }, { "completion_length": 545.0, "epoch": 3.4470620842572064, "grad_norm": 0.0, "kl": 0.2257719337940216, "learning_rate": 1.1124397559339373e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12437 }, { "completion_length": 498.75, "epoch": 3.447339246119734, "grad_norm": 0.0, "kl": 0.19003964960575104, "learning_rate": 1.1120756071402952e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12438 }, { "completion_length": 564.25, "epoch": 3.4476164079822618, "grad_norm": 0.0, "kl": 201556.234375, "learning_rate": 1.1117115009085217e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12439 }, { "completion_length": 534.25, "epoch": 3.4478935698447892, "grad_norm": 0.0, "kl": 0.2295328676700592, "learning_rate": 1.1113474372497814e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12440 }, { "completion_length": 611.5, "epoch": 3.448170731707317, "grad_norm": 0.0, "kl": 0.17777608335018158, "learning_rate": 1.1109834161752384e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12441 }, { "completion_length": 551.5, "epoch": 3.4484478935698446, "grad_norm": 0.0, "kl": 0.2176077663898468, "learning_rate": 1.110619437696055e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12442 }, { "completion_length": 547.25, "epoch": 3.4487250554323725, "grad_norm": 0.0, "kl": 0.23944808542728424, "learning_rate": 1.1102555018233938e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12443 }, { "completion_length": 591.75, "epoch": 3.4490022172949004, "grad_norm": 0.0, "kl": 0.15457111597061157, "learning_rate": 1.109891608568416e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12444 }, { "completion_length": 537.75, "epoch": 3.449279379157428, "grad_norm": 0.0, "kl": 0.17593878507614136, "learning_rate": 1.10952775794228e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12445 }, { "completion_length": 536.0, "epoch": 3.449556541019956, "grad_norm": 0.0, "kl": 0.21185868978500366, "learning_rate": 1.1091639499561432e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12446 }, { "completion_length": 501.5, "epoch": 3.4498337028824833, "grad_norm": 0.0, "kl": 0.18316657841205597, "learning_rate": 1.1088001846211616e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12447 }, { "completion_length": 498.25, "epoch": 3.450110864745011, "grad_norm": 0.8110710978507996, "kl": 34433159168.0, "learning_rate": 1.1084364619484921e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12448 }, { "completion_length": 549.25, "epoch": 3.4503880266075386, "grad_norm": 0.0, "kl": 0.22633251547813416, "learning_rate": 1.1080727819492873e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12449 }, { "completion_length": 494.0, "epoch": 3.4506651884700665, "grad_norm": 0.0, "kl": 0.21430270373821259, "learning_rate": 1.1077091446347001e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12450 }, { "completion_length": 513.75, "epoch": 3.4509423503325944, "grad_norm": 0.0, "kl": 0.19701987504959106, "learning_rate": 1.1073455500158808e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12451 }, { "completion_length": 525.5, "epoch": 3.451219512195122, "grad_norm": 1.358420729637146, "kl": 647916160.0, "learning_rate": 1.1069819981039804e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12452 }, { "completion_length": 527.0, "epoch": 3.45149667405765, "grad_norm": 0.0, "kl": 0.34434497356414795, "learning_rate": 1.1066184889101486e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12453 }, { "completion_length": 520.5, "epoch": 3.4517738359201773, "grad_norm": 0.0, "kl": 0.21052557229995728, "learning_rate": 1.1062550224455298e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12454 }, { "completion_length": 519.5, "epoch": 3.452050997782705, "grad_norm": 0.0, "kl": 0.17363667488098145, "learning_rate": 1.1058915987212728e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12455 }, { "completion_length": 483.0, "epoch": 3.4523281596452327, "grad_norm": 0.0, "kl": 0.21810409426689148, "learning_rate": 1.1055282177485199e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12456 }, { "completion_length": 524.25, "epoch": 3.4526053215077606, "grad_norm": 0.0, "kl": 4767546368.0, "learning_rate": 1.1051648795384167e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12457 }, { "completion_length": 547.0, "epoch": 3.4528824833702885, "grad_norm": 2.780212640762329, "kl": 5.85387852136081e+18, "learning_rate": 1.1048015841021045e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12458 }, { "completion_length": 565.5, "epoch": 3.453159645232816, "grad_norm": 0.0, "kl": 0.19824957847595215, "learning_rate": 1.1044383314507225e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12459 }, { "completion_length": 550.75, "epoch": 3.453436807095344, "grad_norm": 0.0, "kl": 0.16407088935375214, "learning_rate": 1.1040751215954127e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12460 }, { "completion_length": 569.25, "epoch": 3.4537139689578713, "grad_norm": 0.0, "kl": 0.17290915548801422, "learning_rate": 1.1037119545473121e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12461 }, { "completion_length": 529.0, "epoch": 3.453991130820399, "grad_norm": 0.0, "kl": 0.19475165009498596, "learning_rate": 1.103348830317557e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12462 }, { "completion_length": 521.25, "epoch": 3.4542682926829267, "grad_norm": 0.0, "kl": 0.2136947214603424, "learning_rate": 1.102985748917283e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12463 }, { "completion_length": 541.5, "epoch": 3.4545454545454546, "grad_norm": 0.0, "kl": 0.17505541443824768, "learning_rate": 1.102622710357625e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12464 }, { "completion_length": 608.75, "epoch": 3.4548226164079825, "grad_norm": 0.43641841411590576, "kl": 65336008704.0, "learning_rate": 1.102259714649715e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12465 }, { "completion_length": 544.5, "epoch": 3.45509977827051, "grad_norm": 0.0, "kl": 0.22515158355236053, "learning_rate": 1.1018967618046857e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12466 }, { "completion_length": 505.5, "epoch": 3.455376940133038, "grad_norm": 0.0, "kl": 0.20659062266349792, "learning_rate": 1.1015338518336672e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12467 }, { "completion_length": 546.0, "epoch": 3.4556541019955653, "grad_norm": 0.43771684169769287, "kl": 118215090176.0, "learning_rate": 1.1011709847477867e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12468 }, { "completion_length": 607.0, "epoch": 3.4559312638580932, "grad_norm": 0.0, "kl": 0.15531311929225922, "learning_rate": 1.100808160558174e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12469 }, { "completion_length": 534.75, "epoch": 3.4562084257206207, "grad_norm": 0.0, "kl": 0.16752420365810394, "learning_rate": 1.1004453792759547e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12470 }, { "completion_length": 487.5, "epoch": 3.4564855875831486, "grad_norm": 0.5119827389717102, "kl": 44015730688.0, "learning_rate": 1.1000826409122538e-06, "loss": 0.0, "reward": 4.75, "reward_std": 2.0, "rewards/confident_score_func": 1.5, "rewards/correctness_rewards": 1.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12471 }, { "completion_length": 551.0, "epoch": 3.4567627494456765, "grad_norm": 0.0, "kl": 0.1765529066324234, "learning_rate": 1.0997199454781938e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12472 }, { "completion_length": 623.25, "epoch": 3.457039911308204, "grad_norm": 0.0, "kl": 0.19010521471500397, "learning_rate": 1.0993572929848978e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12473 }, { "completion_length": 617.25, "epoch": 3.457317073170732, "grad_norm": 0.0, "kl": 0.18829220533370972, "learning_rate": 1.098994683443489e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12474 }, { "completion_length": 569.0, "epoch": 3.4575942350332594, "grad_norm": 0.43328019976615906, "kl": 0.21179406344890594, "learning_rate": 1.0986321168650834e-06, "loss": -0.0, "reward": 2.75, "reward_std": 2.0, "rewards/confident_score_func": 0.5, "rewards/correctness_rewards": 0.5, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12475 }, { "completion_length": 534.25, "epoch": 3.4578713968957873, "grad_norm": 0.0, "kl": 0.15537279844284058, "learning_rate": 1.0982695932608024e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12476 }, { "completion_length": 529.5, "epoch": 3.4581485587583147, "grad_norm": 0.0, "kl": 0.20272180438041687, "learning_rate": 1.0979071126417607e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12477 }, { "completion_length": 580.25, "epoch": 3.4584257206208426, "grad_norm": 0.0, "kl": 0.17315079271793365, "learning_rate": 1.0975446750190763e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12478 }, { "completion_length": 610.75, "epoch": 3.45870288248337, "grad_norm": 0.0, "kl": 0.17571058869361877, "learning_rate": 1.0971822804038625e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12479 }, { "completion_length": 495.75, "epoch": 3.458980044345898, "grad_norm": 0.0, "kl": 0.2617981433868408, "learning_rate": 1.0968199288072318e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12480 }, { "completion_length": 553.5, "epoch": 3.4592572062084255, "grad_norm": 0.0, "kl": 0.21691352128982544, "learning_rate": 1.096457620240298e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12481 }, { "completion_length": 465.5, "epoch": 3.4595343680709534, "grad_norm": 0.0, "kl": 0.18699152767658234, "learning_rate": 1.09609535471417e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12482 }, { "completion_length": 529.0, "epoch": 3.4598115299334813, "grad_norm": 0.0, "kl": 0.21380388736724854, "learning_rate": 1.0957331322399575e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12483 }, { "completion_length": 481.75, "epoch": 3.4600886917960088, "grad_norm": 0.0, "kl": 0.22602307796478271, "learning_rate": 1.0953709528287673e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12484 }, { "completion_length": 576.0, "epoch": 3.4603658536585367, "grad_norm": 0.0, "kl": 0.18120639026165009, "learning_rate": 1.0950088164917078e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12485 }, { "completion_length": 490.25, "epoch": 3.460643015521064, "grad_norm": 0.0, "kl": 0.20877587795257568, "learning_rate": 1.0946467232398821e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12486 }, { "completion_length": 520.75, "epoch": 3.460920177383592, "grad_norm": 0.0, "kl": 0.17437753081321716, "learning_rate": 1.0942846730843976e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12487 }, { "completion_length": 545.5, "epoch": 3.4611973392461195, "grad_norm": 0.0, "kl": 0.20732329785823822, "learning_rate": 1.0939226660363524e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12488 }, { "completion_length": 552.75, "epoch": 3.4614745011086474, "grad_norm": 0.0, "kl": 1.1087790814330356e+19, "learning_rate": 1.09356070210685e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12489 }, { "completion_length": 607.0, "epoch": 3.4617516629711753, "grad_norm": 0.0, "kl": 3.6895095358541005e+17, "learning_rate": 1.093198781306991e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12490 }, { "completion_length": 518.0, "epoch": 3.462028824833703, "grad_norm": 3.2331199645996094, "kl": 4570226176.0, "learning_rate": 1.0928369036478733e-06, "loss": -0.0, "reward": 3.75, "reward_std": 2.309401035308838, "rewards/confident_score_func": 1.0, "rewards/correctness_rewards": 1.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12491 }, { "completion_length": 538.0, "epoch": 3.4623059866962307, "grad_norm": 0.0, "kl": 0.19286443293094635, "learning_rate": 1.0924750691405941e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12492 }, { "completion_length": 628.75, "epoch": 3.462583148558758, "grad_norm": 0.0, "kl": 2.916958993635082e+18, "learning_rate": 1.0921132777962487e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12493 }, { "completion_length": 544.25, "epoch": 3.462860310421286, "grad_norm": 0.0, "kl": 0.1800776720046997, "learning_rate": 1.0917515296259331e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12494 }, { "completion_length": 510.5, "epoch": 3.4631374722838135, "grad_norm": 0.0, "kl": 9.710288837070029e+17, "learning_rate": 1.091389824640739e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12495 }, { "completion_length": 513.0, "epoch": 3.4634146341463414, "grad_norm": 0.0, "kl": 0.20403392612934113, "learning_rate": 1.0910281628517601e-06, "loss": 0.0, "reward": 1.75, "reward_std": 0.0, "rewards/confident_score_func": 0.0, "rewards/correctness_rewards": 0.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12496 }, { "completion_length": 493.0, "epoch": 3.4636917960088693, "grad_norm": 0.0, "kl": 0.20358742773532867, "learning_rate": 1.0906665442700868e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12497 }, { "completion_length": 475.0, "epoch": 3.463968957871397, "grad_norm": 0.0, "kl": 0.19362422823905945, "learning_rate": 1.0903049689068066e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12498 }, { "completion_length": 521.25, "epoch": 3.4642461197339247, "grad_norm": 0.0, "kl": 0.19796176254749298, "learning_rate": 1.0899434367730096e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12499 }, { "completion_length": 536.0, "epoch": 3.464523281596452, "grad_norm": 0.0, "kl": 0.19768044352531433, "learning_rate": 1.0895819478797817e-06, "loss": 0.0, "reward": 5.75, "reward_std": 0.0, "rewards/confident_score_func": 2.0, "rewards/correctness_rewards": 2.0, "rewards/int_reward_func": 0.5, "rewards/soft_format_reward_func": 0.5, "rewards/xmlcount_reward_func": 0.75, "step": 12500 } ], "logging_steps": 1, "max_steps": 18040, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }