{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.6666666666666666, "eval_steps": 500, "global_step": 20, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 239.0625, "epoch": 0.06666666666666667, "grad_norm": 146.07957458496094, "kl": 0.0, "learning_rate": 4.965903258506806e-07, "loss": 0.0, "reward": 0.877574372687377, "reward_std": 1.1462637517397525, "rewards/concensus_correctness_reward_func": 0.625, "rewards/consensus_reward_func": 0.0625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0625, "rewards/question_recreation_reward_func": 0.10407434520311654, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.023499997798353434, "step": 2 }, { "completion_length": 162.4375, "epoch": 0.13333333333333333, "grad_norm": 44.136566162109375, "kl": 0.004455144211533479, "learning_rate": 4.698684378016222e-07, "loss": 0.0, "reward": 0.35211178776808083, "reward_std": 0.21313007855496835, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0625, "rewards/question_recreation_reward_func": 0.13901804643683136, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.025593749014660716, "step": 4 }, { "completion_length": 372.34375, "epoch": 0.2, "grad_norm": 83.95270538330078, "kl": 0.0029297428282006877, "learning_rate": 4.193203929064353e-07, "loss": 0.0, "reward": 0.7589346559834667, "reward_std": 1.4558915909874486, "rewards/concensus_correctness_reward_func": 0.625, "rewards/consensus_reward_func": 0.0625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.1666221939958632, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.09518750477582216, "step": 6 }, { "completion_length": 398.0, "epoch": 0.26666666666666666, "grad_norm": 124.03618621826172, "kl": 0.001172584436062607, "learning_rate": 3.5042385616324236e-07, "loss": 0.0, "reward": 0.24589170649414882, "reward_std": 0.2928572448272462, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0625, "rewards/question_recreation_reward_func": 0.11979795509250835, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.06359374988824129, "step": 8 }, { "completion_length": 222.03125, "epoch": 0.3333333333333333, "grad_norm": 22.514238357543945, "kl": 0.002474178050761111, "learning_rate": 2.706448363680831e-07, "loss": 0.0, "reward": 0.3457036364998203, "reward_std": 0.20316840970190242, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.1875, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.13420363652403466, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0240000004414469, "step": 10 }, { "completion_length": 361.8125, "epoch": 0.4, "grad_norm": 31.79580307006836, "kl": 0.0017530460136185866, "learning_rate": 1.886286282148002e-07, "loss": 0.0, "reward": 0.09715361235430464, "reward_std": 0.5041031258151634, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.11099736913456582, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": -0.07634374964982271, "step": 12 }, { "completion_length": 216.8125, "epoch": 0.4666666666666667, "grad_norm": 28.405397415161133, "kl": 0.0028270291222725064, "learning_rate": 1.1326296046939333e-07, "loss": 0.0, "reward": 0.2901497831335291, "reward_std": 0.3146807264929521, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0625, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.13105603493750095, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.09659374854527414, "step": 14 }, { "completion_length": 260.75, "epoch": 0.5333333333333333, "grad_norm": 45.744022369384766, "kl": 0.004110226835109643, "learning_rate": 5.271487265090163e-08, "loss": 0.0, "reward": 0.18092850630637258, "reward_std": 0.1472014883504471, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.0, "rewards/question_recreation_reward_func": 0.11767850333126262, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.06325000035576522, "step": 16 }, { "completion_length": 312.4375, "epoch": 0.6, "grad_norm": 24.293758392333984, "kl": 0.0039952012375579216, "learning_rate": 1.3545689574841341e-08, "loss": 0.0, "reward": 0.3126915064640343, "reward_std": 0.18643183991662227, "rewards/concensus_correctness_reward_func": 0.0, "rewards/consensus_reward_func": 0.0, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.1875, "rewards/question_recreation_reward_func": 0.09559775041998364, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0295937517657876, "step": 18 }, { "completion_length": 203.53125, "epoch": 0.6666666666666666, "grad_norm": 121.38932800292969, "kl": 0.09540724094040343, "learning_rate": 0.0, "loss": 0.0001, "reward": 1.114405733300373, "reward_std": 1.2408277603099123, "rewards/concensus_correctness_reward_func": 0.625, "rewards/consensus_reward_func": 0.125, "rewards/cumulative_reward_2": 0.0, "rewards/final_correctness_reward_func": 0.125, "rewards/question_recreation_reward_func": 0.15249946631956846, "rewards/soft_format_reward_func": 0.0, "rewards/strict_format_reward_func": 0.0, "rewards/xmlcount_reward_func": 0.0869062514975667, "step": 20 }, { "epoch": 0.6666666666666666, "step": 20, "total_flos": 0.0, "train_loss": 1.192515795764848e-05, "train_runtime": 1239.9836, "train_samples_per_second": 0.258, "train_steps_per_second": 0.016 } ], "logging_steps": 2, "max_steps": 20, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }