{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.1, "eval_steps": 500, "global_step": 50, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 986.8, "completions/max_terminated_length": 986.8, "completions/mean_length": 859.9, "completions/mean_terminated_length": 859.9, "completions/min_length": 716.6, "completions/min_terminated_length": 716.6, "epoch": 0.01, "frac_reward_zero_std": 0.0, "grad_norm": 0.09593440592288971, "learning_rate": 2e-05, "loss": 0.0005, "num_tokens": 36204.0, "reward": 0.5495000004768371, "reward_std": 0.17061833292245865, "rewards/reward_RULER/mean": 0.5495000004768371, "rewards/reward_RULER/std": 0.17061834409832954, "rewards/reward_short/mean": 0.0, "rewards/reward_short/std": 0.0, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 552.0, "completions/max_terminated_length": 552.0, "completions/mean_length": 464.425, "completions/mean_terminated_length": 464.425, "completions/min_length": 381.0, "completions/min_terminated_length": 381.0, "epoch": 0.02, "frac_reward_zero_std": 0.0, "grad_norm": 0.03579658269882202, "learning_rate": 2e-05, "loss": -0.013, "num_tokens": 57069.0, "reward": 0.6187500059604645, "reward_std": 0.13032747879624368, "rewards/reward_RULER/mean": 0.6187500059604645, "rewards/reward_RULER/std": 0.13032748326659202, "rewards/reward_short/mean": 0.0, "rewards/reward_short/std": 0.0, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1370.8, "completions/max_terminated_length": 1370.8, "completions/mean_length": 1088.35, "completions/mean_terminated_length": 1088.35, "completions/min_length": 836.2, "completions/min_terminated_length": 836.2, "epoch": 0.03, "frac_reward_zero_std": 0.0, "grad_norm": 0.030775019899010658, "learning_rate": 2e-05, "loss": -0.0392, "num_tokens": 102323.0, "reward": 0.503250002861023, "reward_std": 0.14202061779797076, "rewards/reward_RULER/mean": 0.503250002861023, "rewards/reward_RULER/std": 0.1420206159353256, "rewards/reward_short/mean": 0.0, "rewards/reward_short/std": 0.0, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 735.6, "completions/max_terminated_length": 735.6, "completions/mean_length": 602.4, "completions/mean_terminated_length": 602.4, "completions/min_length": 494.4, "completions/min_terminated_length": 494.4, "epoch": 0.04, "frac_reward_zero_std": 0.0, "grad_norm": 0.11490475386381149, "learning_rate": 2e-05, "loss": 0.0578, "num_tokens": 129907.0, "reward": 0.5053500056266784, "reward_std": 0.18268517702817916, "rewards/reward_RULER/mean": 0.5053500056266784, "rewards/reward_RULER/std": 0.1826851785182953, "rewards/reward_short/mean": 0.0, "rewards/reward_short/std": 0.0, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 774.8, "completions/max_terminated_length": 774.8, "completions/mean_length": 638.225, "completions/mean_terminated_length": 638.225, "completions/min_length": 552.0, "completions/min_terminated_length": 552.0, "epoch": 0.05, "frac_reward_zero_std": 0.0, "grad_norm": 0.0625438317656517, "learning_rate": 2e-05, "loss": -0.016, "num_tokens": 157732.0, "reward": 0.521749985218048, "reward_std": 0.15968329459428787, "rewards/reward_RULER/mean": 0.521749985218048, "rewards/reward_RULER/std": 0.15968331769108773, "rewards/reward_short/mean": 0.0, "rewards/reward_short/std": 0.0, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 277.2, "completions/max_terminated_length": 277.2, "completions/mean_length": 210.925, "completions/mean_terminated_length": 210.925, "completions/min_length": 150.4, "completions/min_terminated_length": 150.4, "epoch": 0.06, "frac_reward_zero_std": 0.0, "grad_norm": 0.12225955724716187, "learning_rate": 2e-05, "loss": -0.0276, "num_tokens": 170273.0, "reward": 0.6259999990463256, "reward_std": 0.23134377971291542, "rewards/reward_RULER/mean": 0.6259999990463256, "rewards/reward_RULER/std": 0.2313437804579735, "rewards/reward_short/mean": 0.0, "rewards/reward_short/std": 0.0, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 846.0, "completions/max_terminated_length": 846.0, "completions/mean_length": 736.775, "completions/mean_terminated_length": 736.775, "completions/min_length": 618.4, "completions/min_terminated_length": 618.4, "epoch": 0.07, "frac_reward_zero_std": 0.0, "grad_norm": 0.03475014120340347, "learning_rate": 2e-05, "loss": 0.0223, "num_tokens": 201488.0, "reward": 0.5945000052452087, "reward_std": 0.09596815258264542, "rewards/reward_RULER/mean": 0.5945000052452087, "rewards/reward_RULER/std": 0.09596815742552281, "rewards/reward_short/mean": 0.0, "rewards/reward_short/std": 0.0, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 960.6, "completions/max_terminated_length": 960.6, "completions/mean_length": 774.175, "completions/mean_terminated_length": 774.175, "completions/min_length": 655.4, "completions/min_terminated_length": 655.4, "epoch": 0.08, "frac_reward_zero_std": 0.0, "grad_norm": 0.10908406227827072, "learning_rate": 2e-05, "loss": 0.0087, "num_tokens": 235343.0, "reward": 0.5429999977350235, "reward_std": 0.12850210070610046, "rewards/reward_RULER/mean": 0.5429999977350235, "rewards/reward_RULER/std": 0.12850209772586824, "rewards/reward_short/mean": 0.0, "rewards/reward_short/std": 0.0, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 500.4, "completions/max_terminated_length": 500.4, "completions/mean_length": 400.825, "completions/mean_terminated_length": 400.825, "completions/min_length": 327.0, "completions/min_terminated_length": 327.0, "epoch": 0.09, "frac_reward_zero_std": 0.0, "grad_norm": 0.1326361447572708, "learning_rate": 2e-05, "loss": -0.0385, "num_tokens": 254880.0, "reward": 0.7205000162124634, "reward_std": 0.16288756281137468, "rewards/reward_RULER/mean": 0.7205000162124634, "rewards/reward_RULER/std": 0.1628875747323036, "rewards/reward_short/mean": 0.0, "rewards/reward_short/std": 0.0, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 688.4, "completions/max_terminated_length": 688.4, "completions/mean_length": 601.575, "completions/mean_terminated_length": 601.575, "completions/min_length": 520.0, "completions/min_terminated_length": 520.0, "epoch": 0.1, "frac_reward_zero_std": 0.0, "grad_norm": 0.029254460707306862, "learning_rate": 2e-05, "loss": -0.0005, "num_tokens": 281871.0, "reward": 0.6317500114440918, "reward_std": 0.07507448904216289, "rewards/reward_RULER/mean": 0.6317500114440918, "rewards/reward_RULER/std": 0.0750744953751564, "rewards/reward_short/mean": 0.0, "rewards/reward_short/std": 0.0, "step": 50 } ], "logging_steps": 5, "max_steps": 500, "num_input_tokens_seen": 281871, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }