{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.4, "eval_steps": 500, "global_step": 1800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 309.04, "epoch": 0.011111111111111112, "grad_norm": NaN, "kl": 222.66988746643065, "learning_rate": 5.444444444444444e-07, "loss": 8.9068, "reward": -18.06266725540161, "reward_std": 6.391496688127518, "rewards/check_first_pass": -9.93666666984558, "rewards/check_solution": -7.600000243186951, "rewards/check_solution_words": -6.068000079095364, "rewards/check_word_guesses": 5.54200014591217, "step": 50 }, { "completion_length": 368.64, "epoch": 0.022222222222222223, "grad_norm": NaN, "kl": 557.3866543316841, "learning_rate": 1.1e-06, "loss": 22.2955, "reward": -17.431167125701904, "reward_std": 5.4497878611087796, "rewards/check_first_pass": -9.859833374023438, "rewards/check_solution": -7.2583335638046265, "rewards/check_solution_words": -5.878333521187305, "rewards/check_word_guesses": 5.565333509445191, "step": 100 }, { "completion_length": 346.92, "epoch": 0.03333333333333333, "grad_norm": NaN, "kl": 4737.8455329227445, "learning_rate": 1.6555555555555559e-06, "loss": 189.5138, "reward": -18.070500688552855, "reward_std": 7.8515861177444455, "rewards/check_first_pass": -9.786166725158692, "rewards/check_solution": -7.325000324249268, "rewards/check_solution_words": -7.050333592891693, "rewards/check_word_guesses": 6.091000156402588, "step": 150 }, { "completion_length": 322.2, "epoch": 0.044444444444444446, "grad_norm": NaN, "kl": 32057.38775477886, "learning_rate": 2.2111111111111113e-06, "loss": 1282.2956, "reward": -15.816333751678467, "reward_std": 6.191992573738098, "rewards/check_first_pass": -9.895000038146973, "rewards/check_solution": -7.100000200271606, "rewards/check_solution_words": -4.8800000631809235, "rewards/check_word_guesses": 6.058666839599609, "step": 200 }, { "completion_length": 349.9, "epoch": 0.05555555555555555, "grad_norm": NaN, "kl": 5074.338300862312, "learning_rate": 2.766666666666667e-06, "loss": 202.9736, "reward": -17.724167308807374, "reward_std": 6.207637655735016, "rewards/check_first_pass": -9.912833366394043, "rewards/check_solution": -7.358333556652069, "rewards/check_solution_words": -6.180666843354702, "rewards/check_word_guesses": 5.727666816711426, "step": 250 }, { "completion_length": 336.42, "epoch": 0.06666666666666667, "grad_norm": NaN, "kl": 315.6221669435501, "learning_rate": 3.322222222222222e-06, "loss": 12.6249, "reward": -16.775000438690185, "reward_std": 5.353409328460693, "rewards/check_first_pass": -9.81633337020874, "rewards/check_solution": -7.341666927337647, "rewards/check_solution_words": -5.623000101844471, "rewards/check_word_guesses": 6.006000165939331, "step": 300 }, { "completion_length": 307.04, "epoch": 0.07777777777777778, "grad_norm": NaN, "kl": 6570.5719665384295, "learning_rate": 3.877777777777778e-06, "loss": 262.8229, "reward": -17.077000389099123, "reward_std": 5.669408960938454, "rewards/check_first_pass": -9.886666717529296, "rewards/check_solution": -7.250000200271606, "rewards/check_solution_words": -5.695666807889938, "rewards/check_word_guesses": 5.755333452224732, "step": 350 }, { "completion_length": 313.08, "epoch": 0.08888888888888889, "grad_norm": NaN, "kl": 1532.4928638124466, "learning_rate": 4.433333333333334e-06, "loss": 61.2997, "reward": -17.507167091369627, "reward_std": 5.527194731235504, "rewards/check_first_pass": -9.908166694641114, "rewards/check_solution": -7.30833353638649, "rewards/check_solution_words": -6.251000165343284, "rewards/check_word_guesses": 5.9603334903717045, "step": 400 }, { "completion_length": 329.37666687011716, "epoch": 0.1, "grad_norm": NaN, "kl": 1601.70994805336, "learning_rate": 4.988888888888889e-06, "loss": 64.0684, "reward": -17.980167026519776, "reward_std": 6.458992264270782, "rewards/check_first_pass": -9.801500053405762, "rewards/check_solution": -7.2666668963432315, "rewards/check_solution_words": -6.554666934013366, "rewards/check_word_guesses": 5.64266683101654, "step": 450 }, { "completion_length": 307.52, "epoch": 0.1111111111111111, "grad_norm": NaN, "kl": 702.5347912788391, "learning_rate": 4.998194324998843e-06, "loss": 28.1014, "reward": -16.74250042915344, "reward_std": 6.445133271217347, "rewards/check_first_pass": -9.824500045776368, "rewards/check_solution": -7.308333573341369, "rewards/check_solution_words": -5.524333542585373, "rewards/check_word_guesses": 5.914666795730591, "step": 500 }, { "completion_length": 335.9, "epoch": 0.12222222222222222, "grad_norm": NaN, "kl": 19601.83191286087, "learning_rate": 4.992631880567301e-06, "loss": 784.0733, "reward": -17.86000030517578, "reward_std": 7.05341215133667, "rewards/check_first_pass": -9.785000047683717, "rewards/check_solution": -7.49166690826416, "rewards/check_solution_words": -6.301333554983139, "rewards/check_word_guesses": 5.71800015449524, "step": 550 }, { "completion_length": 298.2, "epoch": 0.13333333333333333, "grad_norm": NaN, "kl": 1115.117756202221, "learning_rate": 4.983320281008445e-06, "loss": 44.6047, "reward": -16.99700037956238, "reward_std": 5.631768324375153, "rewards/check_first_pass": -9.813000040054321, "rewards/check_solution": -7.041666898727417, "rewards/check_solution_words": -6.250666889995337, "rewards/check_word_guesses": 6.108333473205566, "step": 600 }, { "completion_length": 318.48, "epoch": 0.14444444444444443, "grad_norm": NaN, "kl": 3946.661036362648, "learning_rate": 4.970273531852536e-06, "loss": 157.8665, "reward": -17.999333934783934, "reward_std": 6.210418889522552, "rewards/check_first_pass": -9.89133337020874, "rewards/check_solution": -7.458333578109741, "rewards/check_solution_words": -6.459333531856537, "rewards/check_word_guesses": 5.809666805267334, "step": 650 }, { "completion_length": 351.9, "epoch": 0.15555555555555556, "grad_norm": NaN, "kl": 2870.44579018116, "learning_rate": 4.953511256649632e-06, "loss": 114.8178, "reward": -17.553834075927735, "reward_std": 5.835132333040238, "rewards/check_first_pass": -9.929833374023438, "rewards/check_solution": -7.383333520889282, "rewards/check_solution_words": -6.055666868388653, "rewards/check_word_guesses": 5.815000147819519, "step": 700 }, { "completion_length": 308.34, "epoch": 0.16666666666666666, "grad_norm": NaN, "kl": 164.13174985408784, "learning_rate": 4.933058667453916e-06, "loss": 6.5653, "reward": -16.56966731071472, "reward_std": 6.621588716208935, "rewards/check_first_pass": -9.908333358764649, "rewards/check_solution": -7.291666874885559, "rewards/check_solution_words": -5.485666743516922, "rewards/check_word_guesses": 6.116000127792359, "step": 750 }, { "completion_length": 342.34, "epoch": 0.17777777777777778, "grad_norm": NaN, "kl": 1447.0631847190857, "learning_rate": 4.9089465269023596e-06, "loss": 57.8825, "reward": -17.248333780765535, "reward_std": 6.114709348678589, "rewards/check_first_pass": -9.830000019073486, "rewards/check_solution": -7.2333335685729985, "rewards/check_solution_words": -6.300666825771332, "rewards/check_word_guesses": 6.115666842460632, "step": 800 }, { "completion_length": 354.18, "epoch": 0.18888888888888888, "grad_norm": NaN, "kl": 23526.59426044941, "learning_rate": 4.881211101944802e-06, "loss": 941.0638, "reward": -17.54183391571045, "reward_std": 6.4859533834457395, "rewards/check_first_pass": -9.808833379745483, "rewards/check_solution": -7.708333535194397, "rewards/check_solution_words": -5.9636668264865875, "rewards/check_word_guesses": 5.939000129699707, "step": 850 }, { "completion_length": 308.18, "epoch": 0.2, "grad_norm": NaN, "kl": 138.43031896591185, "learning_rate": 4.84989410929501e-06, "loss": 5.5372, "reward": -17.896833839416505, "reward_std": 5.668911509513855, "rewards/check_first_pass": -9.863166694641114, "rewards/check_solution": -7.233333587646484, "rewards/check_solution_words": -6.624666909873485, "rewards/check_word_guesses": 5.824333515167236, "step": 900 }, { "completion_length": 314.82, "epoch": 0.2111111111111111, "grad_norm": NaN, "kl": 1218.171366314888, "learning_rate": 4.815042652684779e-06, "loss": 48.7269, "reward": -16.533334035873413, "reward_std": 7.360376672744751, "rewards/check_first_pass": -9.612000093460082, "rewards/check_solution": -7.158333578109741, "rewards/check_solution_words": -5.995000202357769, "rewards/check_word_guesses": 6.232000198364258, "step": 950 }, { "completion_length": 339.66, "epoch": 0.2222222222222222, "grad_norm": NaN, "kl": 174.28998464107514, "learning_rate": 4.776709152015443e-06, "loss": 6.9716, "reward": -17.22483383178711, "reward_std": 6.013938563764095, "rewards/check_first_pass": -9.816166725158691, "rewards/check_solution": -7.008333616256714, "rewards/check_solution_words": -6.318000204563141, "rewards/check_word_guesses": 5.9176667785644534, "step": 1000 }, { "completion_length": 299.26, "epoch": 0.23333333333333334, "grad_norm": NaN, "kl": 240.47271874427796, "learning_rate": 4.734951264513368e-06, "loss": 9.6189, "reward": -16.81516722679138, "reward_std": 5.74999471783638, "rewards/check_first_pass": -9.819500045776367, "rewards/check_solution": -7.191666922569275, "rewards/check_solution_words": -5.3703335279226305, "rewards/check_word_guesses": 5.566333475112915, "step": 1050 }, { "completion_length": 290.3, "epoch": 0.24444444444444444, "grad_norm": NaN, "kl": 5930.2736493730545, "learning_rate": 4.689831798008002e-06, "loss": 237.2109, "reward": -16.739000663757324, "reward_std": 6.145890753269196, "rewards/check_first_pass": -9.928000049591065, "rewards/check_solution": -7.291666860580444, "rewards/check_solution_words": -5.206000205874443, "rewards/check_word_guesses": 5.686666803359985, "step": 1100 }, { "completion_length": 313.16, "epoch": 0.25555555555555554, "grad_norm": NaN, "kl": 790.292287569046, "learning_rate": 4.641418616462938e-06, "loss": 31.6117, "reward": -18.431500701904298, "reward_std": 5.68714599609375, "rewards/check_first_pass": -9.87116668701172, "rewards/check_solution": -7.8250002098083495, "rewards/check_solution_words": -6.450666869878769, "rewards/check_word_guesses": 5.7153334808349605, "step": 1150 }, { "completion_length": 288.62, "epoch": 0.26666666666666666, "grad_norm": NaN, "kl": 299.9416353178024, "learning_rate": 4.589784537902062e-06, "loss": 11.9977, "reward": -17.612167091369628, "reward_std": 5.032542688846588, "rewards/check_first_pass": -9.781500082015992, "rewards/check_solution": -7.425000247955322, "rewards/check_solution_words": -6.234666793346405, "rewards/check_word_guesses": 5.829000115394592, "step": 1200 }, { "completion_length": 322.98, "epoch": 0.2777777777777778, "grad_norm": NaN, "kl": 2793.8524017858504, "learning_rate": 4.53500722488433e-06, "loss": 111.7541, "reward": -17.664333724975585, "reward_std": 5.747455310821533, "rewards/check_first_pass": -9.923000030517578, "rewards/check_solution": -7.4000002384185795, "rewards/check_solution_words": -6.01900016926229, "rewards/check_word_guesses": 5.6776668119430544, "step": 1250 }, { "completion_length": 339.3, "epoch": 0.28888888888888886, "grad_norm": NaN, "kl": 3813.7429452037813, "learning_rate": 4.477169067691902e-06, "loss": 152.5497, "reward": -17.690833921432496, "reward_std": 5.353043854236603, "rewards/check_first_pass": -9.892833366394044, "rewards/check_solution": -7.600000247955323, "rewards/check_solution_words": -5.904333523511887, "rewards/check_word_guesses": 5.706333441734314, "step": 1300 }, { "completion_length": 310.96, "epoch": 0.3, "grad_norm": NaN, "kl": 304.47424302577974, "learning_rate": 4.416357060407332e-06, "loss": 12.179, "reward": -17.26916711807251, "reward_std": 5.263389755487442, "rewards/check_first_pass": -9.854500017166139, "rewards/check_solution": -7.250000243186951, "rewards/check_solution_words": -5.872333557605743, "rewards/check_word_guesses": 5.707666797637939, "step": 1350 }, { "completion_length": 315.5, "epoch": 0.3111111111111111, "grad_norm": NaN, "kl": 13639.832137713433, "learning_rate": 4.3526626700662e-06, "loss": 545.5934, "reward": -18.18800064086914, "reward_std": 6.097169952392578, "rewards/check_first_pass": -9.844333381652833, "rewards/check_solution": -7.566666946411133, "rewards/check_solution_words": -6.689333482980728, "rewards/check_word_guesses": 5.91233346939087, "step": 1400 }, { "completion_length": 351.12, "epoch": 0.32222222222222224, "grad_norm": NaN, "kl": 2326.0249900770186, "learning_rate": 4.286181699082008e-06, "loss": 93.041, "reward": -18.623000659942626, "reward_std": 6.812479295730591, "rewards/check_first_pass": -9.90633337020874, "rewards/check_solution": -7.350000295639038, "rewards/check_solution_words": -7.338333506584167, "rewards/check_word_guesses": 5.971666851043701, "step": 1450 }, { "completion_length": 366.32, "epoch": 0.3333333333333333, "grad_norm": NaN, "kl": 16549.004248199464, "learning_rate": 4.217014141150248e-06, "loss": 661.9602, "reward": -18.263500604629517, "reward_std": 6.059882239103318, "rewards/check_first_pass": -9.878166694641113, "rewards/check_solution": -7.3666668796539305, "rewards/check_solution_words": -6.7500001257658, "rewards/check_word_guesses": 5.731333417892456, "step": 1500 }, { "completion_length": 320.5, "epoch": 0.34444444444444444, "grad_norm": NaN, "kl": 8347.091685709953, "learning_rate": 4.145264030848381e-06, "loss": 333.8836, "reward": -17.722667150497436, "reward_std": 5.240287501811981, "rewards/check_first_pass": -9.976666679382324, "rewards/check_solution": -7.566666932106018, "rewards/check_solution_words": -5.952333456873894, "rewards/check_word_guesses": 5.773000164031982, "step": 1550 }, { "completion_length": 313.42, "epoch": 0.35555555555555557, "grad_norm": NaN, "kl": 864.0743899011612, "learning_rate": 4.071039287157953e-06, "loss": 34.563, "reward": -17.983000602722168, "reward_std": 5.850111997127533, "rewards/check_first_pass": -9.918333358764649, "rewards/check_solution": -7.208333625793457, "rewards/check_solution_words": -6.453333538174629, "rewards/check_word_guesses": 5.597000193595886, "step": 1600 }, { "completion_length": 329.72, "epoch": 0.36666666666666664, "grad_norm": NaN, "kl": 396.7952742242813, "learning_rate": 3.9944515511441995e-06, "loss": 15.8718, "reward": -16.43366714477539, "reward_std": 7.244253120422363, "rewards/check_first_pass": -9.88666669845581, "rewards/check_solution": -6.900000200271607, "rewards/check_solution_words": -5.581333435922861, "rewards/check_word_guesses": 5.93433349609375, "step": 1650 }, { "completion_length": 295.6, "epoch": 0.37777777777777777, "grad_norm": NaN, "kl": 2458.04032143116, "learning_rate": 3.915616018037271e-06, "loss": 98.3216, "reward": -16.582167387008667, "reward_std": 6.116619675159455, "rewards/check_first_pass": -9.845500040054322, "rewards/check_solution": -7.29166687965393, "rewards/check_solution_words": -5.515666830142339, "rewards/check_word_guesses": 6.0706668472290035, "step": 1700 }, { "completion_length": 305.44, "epoch": 0.3888888888888889, "grad_norm": NaN, "kl": 6297.921968564987, "learning_rate": 3.834651263967667e-06, "loss": 251.9169, "reward": -17.544833850860595, "reward_std": 6.234307850599289, "rewards/check_first_pass": -9.903166675567627, "rewards/check_solution": -7.258333530426025, "rewards/check_solution_words": -6.216000239551067, "rewards/check_word_guesses": 5.832666802406311, "step": 1750 }, { "completion_length": 293.7, "epoch": 0.4, "grad_norm": NaN, "kl": 4064.312862081528, "learning_rate": 3.7516790676164795e-06, "loss": 162.5725, "reward": -17.36033399581909, "reward_std": 5.197294096946717, "rewards/check_first_pass": -9.983333339691162, "rewards/check_solution": -7.516666932106018, "rewards/check_solution_words": -5.423000110387802, "rewards/check_word_guesses": 5.562666816711426, "step": 1800 } ], "logging_steps": 50, "max_steps": 4500, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 12, "trial_name": null, "trial_params": null }