| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.9994879672299027, | |
| "eval_steps": 100, | |
| "global_step": 488, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 839.4509201049805, | |
| "epoch": 0.0005049548696585243, | |
| "grad_norm": 0.2548963129520416, | |
| "kl": 0.0, | |
| "learning_rate": 1.5151515151515152e-08, | |
| "loss": 0.1299, | |
| "reward": 0.3950892947614193, | |
| "reward_std": 0.3455488122999668, | |
| "rewards/accuracy_reward": 0.3950892947614193, | |
| "rewards/format_reward": 0.0, | |
| "step": 1 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 847.5413093566895, | |
| "epoch": 0.0025247743482926213, | |
| "grad_norm": 0.17471390962600708, | |
| "kl": 0.0001666247844696045, | |
| "learning_rate": 7.575757575757576e-08, | |
| "loss": 0.1225, | |
| "reward": 0.34858631575480103, | |
| "reward_std": 0.3048761789686978, | |
| "rewards/accuracy_reward": 0.34858631575480103, | |
| "rewards/format_reward": 0.0, | |
| "step": 5 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 862.8038803100586, | |
| "epoch": 0.005049548696585243, | |
| "grad_norm": 0.2465265542268753, | |
| "kl": 0.00022046566009521485, | |
| "learning_rate": 1.5151515151515152e-07, | |
| "loss": 0.138, | |
| "reward": 0.36235119625926016, | |
| "reward_std": 0.31897980310022833, | |
| "rewards/accuracy_reward": 0.36235119625926016, | |
| "rewards/format_reward": 0.0, | |
| "step": 10 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 858.6939163208008, | |
| "epoch": 0.0075743230448778644, | |
| "grad_norm": 0.16985075175762177, | |
| "kl": 0.00023434162139892578, | |
| "learning_rate": 2.2727272727272729e-07, | |
| "loss": 0.1388, | |
| "reward": 0.3501488171517849, | |
| "reward_std": 0.3107242323458195, | |
| "rewards/accuracy_reward": 0.3500000074505806, | |
| "rewards/format_reward": 0.0001488095265813172, | |
| "step": 15 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 871.340788269043, | |
| "epoch": 0.010099097393170485, | |
| "grad_norm": 0.19429659843444824, | |
| "kl": 0.0002512931823730469, | |
| "learning_rate": 3.0303030303030305e-07, | |
| "loss": 0.1322, | |
| "reward": 0.3501488149166107, | |
| "reward_std": 0.3262017160654068, | |
| "rewards/accuracy_reward": 0.3501488149166107, | |
| "rewards/format_reward": 0.0, | |
| "step": 20 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 865.7901931762696, | |
| "epoch": 0.012623871741463106, | |
| "grad_norm": 0.14376680552959442, | |
| "kl": 0.0004102230072021484, | |
| "learning_rate": 3.787878787878788e-07, | |
| "loss": 0.1202, | |
| "reward": 0.3611607223749161, | |
| "reward_std": 0.31936271116137505, | |
| "rewards/accuracy_reward": 0.3611607223749161, | |
| "rewards/format_reward": 0.0, | |
| "step": 25 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 864.0622192382813, | |
| "epoch": 0.015148646089755729, | |
| "grad_norm": 0.18397963047027588, | |
| "kl": 0.0009374618530273438, | |
| "learning_rate": 4.5454545454545457e-07, | |
| "loss": 0.1329, | |
| "reward": 0.3620535783469677, | |
| "reward_std": 0.3106327898800373, | |
| "rewards/accuracy_reward": 0.3620535783469677, | |
| "rewards/format_reward": 0.0, | |
| "step": 30 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 869.1050796508789, | |
| "epoch": 0.01767342043804835, | |
| "grad_norm": 0.14349086582660675, | |
| "kl": 0.005994081497192383, | |
| "learning_rate": 5.303030303030303e-07, | |
| "loss": 0.1382, | |
| "reward": 0.37440476976335046, | |
| "reward_std": 0.3009735390543938, | |
| "rewards/accuracy_reward": 0.3742559600621462, | |
| "rewards/format_reward": 0.0001488095265813172, | |
| "step": 35 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 856.7110305786133, | |
| "epoch": 0.02019819478634097, | |
| "grad_norm": 1.4588767290115356, | |
| "kl": 0.0035955429077148436, | |
| "learning_rate": 6.060606060606061e-07, | |
| "loss": 0.1527, | |
| "reward": 0.413244054839015, | |
| "reward_std": 0.301710956916213, | |
| "rewards/accuracy_reward": 0.413244054839015, | |
| "rewards/format_reward": 0.0, | |
| "step": 40 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 896.8750152587891, | |
| "epoch": 0.02272296913463359, | |
| "grad_norm": 0.15681959688663483, | |
| "kl": 0.009355926513671875, | |
| "learning_rate": 6.818181818181818e-07, | |
| "loss": 0.1496, | |
| "reward": 0.4224702470004559, | |
| "reward_std": 0.3048379421234131, | |
| "rewards/accuracy_reward": 0.4224702470004559, | |
| "rewards/format_reward": 0.0, | |
| "step": 45 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 843.5924240112305, | |
| "epoch": 0.025247743482926212, | |
| "grad_norm": 0.14472152292728424, | |
| "kl": 0.00784759521484375, | |
| "learning_rate": 7.575757575757576e-07, | |
| "loss": 0.1563, | |
| "reward": 0.44523810148239135, | |
| "reward_std": 0.28487018793821334, | |
| "rewards/accuracy_reward": 0.44523810148239135, | |
| "rewards/format_reward": 0.0, | |
| "step": 50 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 849.950163269043, | |
| "epoch": 0.027772517831218833, | |
| "grad_norm": 2.2360219955444336, | |
| "kl": 0.026047897338867188, | |
| "learning_rate": 8.333333333333334e-07, | |
| "loss": 0.1656, | |
| "reward": 0.48050596117973327, | |
| "reward_std": 0.2785939615219831, | |
| "rewards/accuracy_reward": 0.48050596117973327, | |
| "rewards/format_reward": 0.0, | |
| "step": 55 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 835.623080444336, | |
| "epoch": 0.030297292179511458, | |
| "grad_norm": 0.1463390588760376, | |
| "kl": 0.025413894653320314, | |
| "learning_rate": 9.090909090909091e-07, | |
| "loss": 0.1391, | |
| "reward": 0.49255953207612035, | |
| "reward_std": 0.2822702344506979, | |
| "rewards/accuracy_reward": 0.49255953207612035, | |
| "rewards/format_reward": 0.0, | |
| "step": 60 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 822.6875152587891, | |
| "epoch": 0.032822066527804075, | |
| "grad_norm": 0.23362481594085693, | |
| "kl": 0.012601470947265625, | |
| "learning_rate": 9.84848484848485e-07, | |
| "loss": 0.1345, | |
| "reward": 0.4882440596818924, | |
| "reward_std": 0.27538601867854595, | |
| "rewards/accuracy_reward": 0.4882440596818924, | |
| "rewards/format_reward": 0.0, | |
| "step": 65 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 799.619660949707, | |
| "epoch": 0.0353468408760967, | |
| "grad_norm": 0.4000104069709778, | |
| "kl": 0.008473968505859375, | |
| "learning_rate": 1.0606060606060606e-06, | |
| "loss": 0.1164, | |
| "reward": 0.4852678671479225, | |
| "reward_std": 0.2619207199662924, | |
| "rewards/accuracy_reward": 0.4852678671479225, | |
| "rewards/format_reward": 0.0, | |
| "step": 70 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 772.0263519287109, | |
| "epoch": 0.03787161522438932, | |
| "grad_norm": 2.9015414714813232, | |
| "kl": 0.016247940063476563, | |
| "learning_rate": 1.1363636363636364e-06, | |
| "loss": 0.1033, | |
| "reward": 0.504464291036129, | |
| "reward_std": 0.24284302350133657, | |
| "rewards/accuracy_reward": 0.504464291036129, | |
| "rewards/format_reward": 0.0, | |
| "step": 75 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 781.6562637329101, | |
| "epoch": 0.04039638957268194, | |
| "grad_norm": 0.23751582205295563, | |
| "kl": 0.008544158935546876, | |
| "learning_rate": 1.2121212121212122e-06, | |
| "loss": 0.0983, | |
| "reward": 0.5026785813271999, | |
| "reward_std": 0.24498660415410994, | |
| "rewards/accuracy_reward": 0.5026785813271999, | |
| "rewards/format_reward": 0.0, | |
| "step": 80 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 759.9604278564453, | |
| "epoch": 0.042921163920974566, | |
| "grad_norm": 4.397524356842041, | |
| "kl": 0.011675262451171875, | |
| "learning_rate": 1.2878787878787878e-06, | |
| "loss": 0.0924, | |
| "reward": 0.5069940604269505, | |
| "reward_std": 0.2519028801470995, | |
| "rewards/accuracy_reward": 0.5069940604269505, | |
| "rewards/format_reward": 0.0, | |
| "step": 85 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 776.5525421142578, | |
| "epoch": 0.04544593826926718, | |
| "grad_norm": 0.5079767107963562, | |
| "kl": 0.0114349365234375, | |
| "learning_rate": 1.3636363636363636e-06, | |
| "loss": 0.0933, | |
| "reward": 0.5169642955064774, | |
| "reward_std": 0.26492451392114164, | |
| "rewards/accuracy_reward": 0.5169642955064774, | |
| "rewards/format_reward": 0.0, | |
| "step": 90 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 734.737516784668, | |
| "epoch": 0.04797071261755981, | |
| "grad_norm": 11.932097434997559, | |
| "kl": 0.019676971435546874, | |
| "learning_rate": 1.4393939393939394e-06, | |
| "loss": 0.0666, | |
| "reward": 0.530654776096344, | |
| "reward_std": 0.23760274276137353, | |
| "rewards/accuracy_reward": 0.530654776096344, | |
| "rewards/format_reward": 0.0, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.050495486965852425, | |
| "grad_norm": 1.00216805934906, | |
| "learning_rate": 1.5151515151515152e-06, | |
| "loss": 0.0734, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.050495486965852425, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 795.8742891868449, | |
| "eval_kl": 0.01560433903095885, | |
| "eval_loss": 0.05890418216586113, | |
| "eval_reward": 0.4977447599727915, | |
| "eval_reward_std": 0.228113556685655, | |
| "eval_rewards/accuracy_reward": 0.4977447599727915, | |
| "eval_rewards/format_reward": 0.0, | |
| "eval_runtime": 5093.0388, | |
| "eval_samples_per_second": 0.754, | |
| "eval_steps_per_second": 0.005, | |
| "step": 100 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 733.3500892639161, | |
| "epoch": 0.05302026131414505, | |
| "grad_norm": 0.22215726971626282, | |
| "kl": 0.013599395751953125, | |
| "learning_rate": 1.5909090909090908e-06, | |
| "loss": 0.0612, | |
| "reward": 0.5424851287156344, | |
| "reward_std": 0.23987281071022154, | |
| "rewards/accuracy_reward": 0.5424851287156344, | |
| "rewards/format_reward": 0.0, | |
| "step": 105 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 753.378581237793, | |
| "epoch": 0.05554503566243767, | |
| "grad_norm": 0.3153633177280426, | |
| "kl": 0.01254425048828125, | |
| "learning_rate": 1.6666666666666669e-06, | |
| "loss": 0.0519, | |
| "reward": 0.5129464380443096, | |
| "reward_std": 0.23213699869811535, | |
| "rewards/accuracy_reward": 0.5129464380443096, | |
| "rewards/format_reward": 0.0, | |
| "step": 110 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 728.7988174438476, | |
| "epoch": 0.05806981001073029, | |
| "grad_norm": 0.5135952234268188, | |
| "kl": 0.0145751953125, | |
| "learning_rate": 1.7424242424242423e-06, | |
| "loss": 0.0601, | |
| "reward": 0.5419642932713031, | |
| "reward_std": 0.23888240195810795, | |
| "rewards/accuracy_reward": 0.5419642932713031, | |
| "rewards/format_reward": 0.0, | |
| "step": 115 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 718.3470321655274, | |
| "epoch": 0.060594584359022916, | |
| "grad_norm": 32253.96484375, | |
| "kl": 8.86512451171875, | |
| "learning_rate": 1.8181818181818183e-06, | |
| "loss": 0.462, | |
| "reward": 0.5446428656578064, | |
| "reward_std": 0.21705915424972771, | |
| "rewards/accuracy_reward": 0.5446428656578064, | |
| "rewards/format_reward": 0.0, | |
| "step": 120 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 719.036474609375, | |
| "epoch": 0.06311935870731554, | |
| "grad_norm": 0.20379751920700073, | |
| "kl": 0.01573638916015625, | |
| "learning_rate": 1.893939393939394e-06, | |
| "loss": 0.066, | |
| "reward": 0.5467261992394924, | |
| "reward_std": 0.22874139733612536, | |
| "rewards/accuracy_reward": 0.5467261992394924, | |
| "rewards/format_reward": 0.0, | |
| "step": 125 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 766.3154907226562, | |
| "epoch": 0.06564413305560815, | |
| "grad_norm": 0.21040405333042145, | |
| "kl": 0.02525787353515625, | |
| "learning_rate": 1.96969696969697e-06, | |
| "loss": 0.0674, | |
| "reward": 0.514732152968645, | |
| "reward_std": 0.2296198945492506, | |
| "rewards/accuracy_reward": 0.514732152968645, | |
| "rewards/format_reward": 0.0, | |
| "step": 130 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 772.217121887207, | |
| "epoch": 0.06816890740390077, | |
| "grad_norm": 0.392875999212265, | |
| "kl": 0.033099365234375, | |
| "learning_rate": 2.0454545454545453e-06, | |
| "loss": 0.1037, | |
| "reward": 0.5169642955064774, | |
| "reward_std": 0.2294642563909292, | |
| "rewards/accuracy_reward": 0.5169642955064774, | |
| "rewards/format_reward": 0.0, | |
| "step": 135 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 845.8943603515625, | |
| "epoch": 0.0706936817521934, | |
| "grad_norm": 19866.705078125, | |
| "kl": 3.577685546875, | |
| "learning_rate": 2.121212121212121e-06, | |
| "loss": 0.6622, | |
| "reward": 0.48511905670166017, | |
| "reward_std": 0.2477981884032488, | |
| "rewards/accuracy_reward": 0.48511905670166017, | |
| "rewards/format_reward": 0.0, | |
| "step": 140 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 844.8823059082031, | |
| "epoch": 0.07321845610048602, | |
| "grad_norm": 0.47434961795806885, | |
| "kl": 0.17655029296875, | |
| "learning_rate": 2.1969696969696974e-06, | |
| "loss": 0.146, | |
| "reward": 0.4336309637874365, | |
| "reward_std": 0.2661529861390591, | |
| "rewards/accuracy_reward": 0.4336309637874365, | |
| "rewards/format_reward": 0.0, | |
| "step": 145 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 758.0837905883789, | |
| "epoch": 0.07574323044877863, | |
| "grad_norm": 9.017133712768555, | |
| "kl": 0.224951171875, | |
| "learning_rate": 2.2727272727272728e-06, | |
| "loss": 0.1059, | |
| "reward": 0.44047619961202145, | |
| "reward_std": 0.2847708839923143, | |
| "rewards/accuracy_reward": 0.44047619961202145, | |
| "rewards/format_reward": 0.0, | |
| "step": 150 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 679.5257614135742, | |
| "epoch": 0.07826800479707126, | |
| "grad_norm": 15.091608047485352, | |
| "kl": 0.21767578125, | |
| "learning_rate": 2.3484848484848486e-06, | |
| "loss": 0.0639, | |
| "reward": 0.4681547723710537, | |
| "reward_std": 0.26657171882689, | |
| "rewards/accuracy_reward": 0.4681547723710537, | |
| "rewards/format_reward": 0.0, | |
| "step": 155 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 594.3864700317383, | |
| "epoch": 0.08079277914536388, | |
| "grad_norm": 18.641502380371094, | |
| "kl": 0.4121337890625, | |
| "learning_rate": 2.4242424242424244e-06, | |
| "loss": 0.0134, | |
| "reward": 0.4694940596818924, | |
| "reward_std": 0.26797937490046025, | |
| "rewards/accuracy_reward": 0.4694940596818924, | |
| "rewards/format_reward": 0.0, | |
| "step": 160 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 573.9558166503906, | |
| "epoch": 0.08331755349365651, | |
| "grad_norm": 3.902953624725342, | |
| "kl": 0.3707763671875, | |
| "learning_rate": 2.5e-06, | |
| "loss": -0.0055, | |
| "reward": 0.4639881007373333, | |
| "reward_std": 0.24555196836590767, | |
| "rewards/accuracy_reward": 0.4639881007373333, | |
| "rewards/format_reward": 0.0, | |
| "step": 165 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 567.443019104004, | |
| "epoch": 0.08584232784194913, | |
| "grad_norm": 4.564860820770264, | |
| "kl": 21.7295166015625, | |
| "learning_rate": 2.5757575757575756e-06, | |
| "loss": 1.4361, | |
| "reward": 0.5007440589368344, | |
| "reward_std": 0.25035002939403056, | |
| "rewards/accuracy_reward": 0.5007440589368344, | |
| "rewards/format_reward": 0.0, | |
| "step": 170 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 573.968465423584, | |
| "epoch": 0.08836710219024174, | |
| "grad_norm": 2.453469753265381, | |
| "kl": 0.58544921875, | |
| "learning_rate": 2.651515151515152e-06, | |
| "loss": -0.0241, | |
| "reward": 0.479761915653944, | |
| "reward_std": 0.24731975235044956, | |
| "rewards/accuracy_reward": 0.479761915653944, | |
| "rewards/format_reward": 0.0, | |
| "step": 175 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 592.1180152893066, | |
| "epoch": 0.09089187653853437, | |
| "grad_norm": 7.268795967102051, | |
| "kl": 1.03837890625, | |
| "learning_rate": 2.7272727272727272e-06, | |
| "loss": 0.0296, | |
| "reward": 0.5111607246100902, | |
| "reward_std": 0.25646619591861963, | |
| "rewards/accuracy_reward": 0.5111607246100902, | |
| "rewards/format_reward": 0.0, | |
| "step": 180 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 609.6851280212402, | |
| "epoch": 0.09341665088682699, | |
| "grad_norm": 3.061584234237671, | |
| "kl": 0.851953125, | |
| "learning_rate": 2.803030303030303e-06, | |
| "loss": 0.017, | |
| "reward": 0.4995535835623741, | |
| "reward_std": 0.23969481624662875, | |
| "rewards/accuracy_reward": 0.4995535835623741, | |
| "rewards/format_reward": 0.0, | |
| "step": 185 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 605.6526893615722, | |
| "epoch": 0.09594142523511962, | |
| "grad_norm": 15.86893367767334, | |
| "kl": 2.0413330078125, | |
| "learning_rate": 2.878787878787879e-06, | |
| "loss": 0.0528, | |
| "reward": 0.5168154835700989, | |
| "reward_std": 0.24480598308146, | |
| "rewards/accuracy_reward": 0.5168154835700989, | |
| "rewards/format_reward": 0.0, | |
| "step": 190 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 589.4162292480469, | |
| "epoch": 0.09846619958341224, | |
| "grad_norm": 51.3820915222168, | |
| "kl": 1.005224609375, | |
| "learning_rate": 2.9545454545454547e-06, | |
| "loss": 0.0164, | |
| "reward": 0.5026785813271999, | |
| "reward_std": 0.24141334034502507, | |
| "rewards/accuracy_reward": 0.5026785813271999, | |
| "rewards/format_reward": 0.0, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.10099097393170485, | |
| "grad_norm": 2.8177883625030518, | |
| "learning_rate": 2.999990675938041e-06, | |
| "loss": 0.0187, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.10099097393170485, | |
| "eval_clip_ratio": 0.0, | |
| "eval_completion_length": 755.0453377480833, | |
| "eval_kl": 2.4657899844720497, | |
| "eval_loss": 0.17787984013557434, | |
| "eval_reward": 0.4527506747786303, | |
| "eval_reward_std": 0.23182548258615576, | |
| "eval_rewards/accuracy_reward": 0.4527506747786303, | |
| "eval_rewards/format_reward": 0.0, | |
| "eval_runtime": 5012.4785, | |
| "eval_samples_per_second": 0.766, | |
| "eval_steps_per_second": 0.005, | |
| "step": 200 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1894.196615600586, | |
| "epoch": 0.4198668714797747, | |
| "grad_norm": 2.7988369464874268, | |
| "kl": 1.30494384765625, | |
| "learning_rate": 2.1584141861097513e-06, | |
| "loss": 0.0839, | |
| "reward": 0.07008928679861129, | |
| "reward_std": 0.05558663532137871, | |
| "rewards/accuracy_reward": 0.07008928679861129, | |
| "rewards/format_reward": 0.0, | |
| "step": 205 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1115.846598815918, | |
| "epoch": 0.43010752688172044, | |
| "grad_norm": 4.760381698608398, | |
| "kl": 0.363568115234375, | |
| "learning_rate": 2.1097780693886946e-06, | |
| "loss": 0.03, | |
| "reward": 0.11488095466047525, | |
| "reward_std": 0.09430552553385496, | |
| "rewards/accuracy_reward": 0.11488095466047525, | |
| "rewards/format_reward": 0.0, | |
| "step": 210 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 741.814892578125, | |
| "epoch": 0.4403481822836662, | |
| "grad_norm": 1.493429183959961, | |
| "kl": 0.41728515625, | |
| "learning_rate": 2.060361338950506e-06, | |
| "loss": 0.0096, | |
| "reward": 0.10699404974002391, | |
| "reward_std": 0.08532515289261937, | |
| "rewards/accuracy_reward": 0.10699404974002391, | |
| "rewards/format_reward": 0.0, | |
| "step": 215 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 757.3077529907226, | |
| "epoch": 0.45058883768561186, | |
| "grad_norm": 0.6104231476783752, | |
| "kl": 0.28935546875, | |
| "learning_rate": 2.010227256134962e-06, | |
| "loss": 0.0056, | |
| "reward": 0.10029762159101666, | |
| "reward_std": 0.08028191016055644, | |
| "rewards/accuracy_reward": 0.10029762159101666, | |
| "rewards/format_reward": 0.0, | |
| "step": 220 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 796.1696578979493, | |
| "epoch": 0.4608294930875576, | |
| "grad_norm": 3.8080809116363525, | |
| "kl": 0.3931396484375, | |
| "learning_rate": 1.9594400006079216e-06, | |
| "loss": 0.0098, | |
| "reward": 0.11964285904541612, | |
| "reward_std": 0.09335053358227015, | |
| "rewards/accuracy_reward": 0.11964285904541612, | |
| "rewards/format_reward": 0.0, | |
| "step": 225 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 872.2796279907227, | |
| "epoch": 0.47107014848950335, | |
| "grad_norm": 2.8400909900665283, | |
| "kl": 0.3711669921875, | |
| "learning_rate": 1.9080645882010618e-06, | |
| "loss": 0.0085, | |
| "reward": 0.1058035738300532, | |
| "reward_std": 0.08510495922528208, | |
| "rewards/accuracy_reward": 0.1058035738300532, | |
| "rewards/format_reward": 0.0, | |
| "step": 230 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 950.6646041870117, | |
| "epoch": 0.48131080389144903, | |
| "grad_norm": 1.133971929550171, | |
| "kl": 0.3427490234375, | |
| "learning_rate": 1.8561667876811873e-06, | |
| "loss": 0.0169, | |
| "reward": 0.1235119073651731, | |
| "reward_std": 0.08968696529045701, | |
| "rewards/accuracy_reward": 0.1235119073651731, | |
| "rewards/format_reward": 0.0, | |
| "step": 235 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 973.8951126098633, | |
| "epoch": 0.4915514592933948, | |
| "grad_norm": 0.8387630581855774, | |
| "kl": 0.418408203125, | |
| "learning_rate": 1.8038130365556615e-06, | |
| "loss": 0.0219, | |
| "reward": 0.12857143091969192, | |
| "reward_std": 0.10588475931435823, | |
| "rewards/accuracy_reward": 0.12857143091969192, | |
| "rewards/format_reward": 0.0, | |
| "step": 240 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 962.3290390014648, | |
| "epoch": 0.5017921146953405, | |
| "grad_norm": 2.0839455127716064, | |
| "kl": 0.6023193359375, | |
| "learning_rate": 1.751070356021742e-06, | |
| "loss": 0.0248, | |
| "reward": 0.12485119318589569, | |
| "reward_std": 0.10606874478980899, | |
| "rewards/accuracy_reward": 0.12485119318589569, | |
| "rewards/format_reward": 0.0, | |
| "step": 245 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1087.3797821044923, | |
| "epoch": 0.5120327700972862, | |
| "grad_norm": 4.897603988647461, | |
| "kl": 0.297705078125, | |
| "learning_rate": 1.6980062651687004e-06, | |
| "loss": 0.0245, | |
| "reward": 0.12886905036866664, | |
| "reward_std": 0.10398078355938196, | |
| "rewards/accuracy_reward": 0.12886905036866664, | |
| "rewards/format_reward": 0.0, | |
| "step": 250 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1061.2479339599608, | |
| "epoch": 0.522273425499232, | |
| "grad_norm": 10.860794067382812, | |
| "kl": 0.60859375, | |
| "learning_rate": 1.6446886945425608e-06, | |
| "loss": 0.0344, | |
| "reward": 0.14122024178504944, | |
| "reward_std": 0.10650016171857715, | |
| "rewards/accuracy_reward": 0.14122024178504944, | |
| "rewards/format_reward": 0.0, | |
| "step": 255 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1152.5089492797852, | |
| "epoch": 0.5325140809011777, | |
| "grad_norm": 1.4874508380889893, | |
| "kl": 0.3950927734375, | |
| "learning_rate": 1.5911858991841063e-06, | |
| "loss": 0.0344, | |
| "reward": 0.1450892877765, | |
| "reward_std": 0.10981706054881216, | |
| "rewards/accuracy_reward": 0.1450892877765, | |
| "rewards/format_reward": 0.0, | |
| "step": 260 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1092.9384155273438, | |
| "epoch": 0.5427547363031234, | |
| "grad_norm": 1.4266042709350586, | |
| "kl": 0.861279296875, | |
| "learning_rate": 1.537566371251482e-06, | |
| "loss": 0.0509, | |
| "reward": 0.14776785927824676, | |
| "reward_std": 0.11378104742616416, | |
| "rewards/accuracy_reward": 0.14776785927824676, | |
| "rewards/format_reward": 0.0, | |
| "step": 265 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1142.4348419189453, | |
| "epoch": 0.5529953917050692, | |
| "grad_norm": 4.305612564086914, | |
| "kl": 0.542431640625, | |
| "learning_rate": 1.4838987523392473e-06, | |
| "loss": 0.0374, | |
| "reward": 0.149107145704329, | |
| "reward_std": 0.10749768582172692, | |
| "rewards/accuracy_reward": 0.149107145704329, | |
| "rewards/format_reward": 0.0, | |
| "step": 270 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1178.0220428466796, | |
| "epoch": 0.5632360471070148, | |
| "grad_norm": 1.8183989524841309, | |
| "kl": 0.477099609375, | |
| "learning_rate": 1.4302517456061282e-06, | |
| "loss": 0.0336, | |
| "reward": 0.14880952723324298, | |
| "reward_std": 0.115222292765975, | |
| "rewards/accuracy_reward": 0.14880952723324298, | |
| "rewards/format_reward": 0.0, | |
| "step": 275 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1173.8620773315429, | |
| "epoch": 0.5734767025089605, | |
| "grad_norm": 3.585653781890869, | |
| "kl": 0.5623046875, | |
| "learning_rate": 1.3766940278239555e-06, | |
| "loss": 0.0363, | |
| "reward": 0.13660714430734516, | |
| "reward_std": 0.10423116614110768, | |
| "rewards/accuracy_reward": 0.13660714430734516, | |
| "rewards/format_reward": 0.0, | |
| "step": 280 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1201.4131134033203, | |
| "epoch": 0.5837173579109063, | |
| "grad_norm": 1.5504530668258667, | |
| "kl": 0.548388671875, | |
| "learning_rate": 1.3232941614603844e-06, | |
| "loss": 0.0346, | |
| "reward": 0.14375000298023224, | |
| "reward_std": 0.12105894400738179, | |
| "rewards/accuracy_reward": 0.14375000298023224, | |
| "rewards/format_reward": 0.0, | |
| "step": 285 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1276.2178771972656, | |
| "epoch": 0.593958013312852, | |
| "grad_norm": 1.5955944061279297, | |
| "kl": 0.5833984375, | |
| "learning_rate": 1.270120506907939e-06, | |
| "loss": 0.0347, | |
| "reward": 0.13065476389601827, | |
| "reward_std": 0.0991502583026886, | |
| "rewards/accuracy_reward": 0.13065476389601827, | |
| "rewards/format_reward": 0.0, | |
| "step": 290 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1367.1632751464845, | |
| "epoch": 0.6041986687147978, | |
| "grad_norm": 6.252760410308838, | |
| "kl": 0.5171875, | |
| "learning_rate": 1.2172411349717491e-06, | |
| "loss": 0.0357, | |
| "reward": 0.133630954567343, | |
| "reward_std": 0.11369595509022475, | |
| "rewards/accuracy_reward": 0.133630954567343, | |
| "rewards/format_reward": 0.0, | |
| "step": 295 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1294.0861907958983, | |
| "epoch": 0.6144393241167435, | |
| "grad_norm": 1.7657579183578491, | |
| "kl": 0.9818359375, | |
| "learning_rate": 1.1647237397280027e-06, | |
| "loss": 0.0553, | |
| "reward": 0.11979166907258332, | |
| "reward_std": 0.10067843548022211, | |
| "rewards/accuracy_reward": 0.11979166907258332, | |
| "rewards/format_reward": 0.0, | |
| "step": 300 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1419.5473419189452, | |
| "epoch": 0.6246799795186891, | |
| "grad_norm": 2.0053577423095703, | |
| "kl": 0.32607421875, | |
| "learning_rate": 1.112635551864672e-06, | |
| "loss": 0.0249, | |
| "reward": 0.1291666693985462, | |
| "reward_std": 0.09413350112736225, | |
| "rewards/accuracy_reward": 0.1291666693985462, | |
| "rewards/format_reward": 0.0, | |
| "step": 305 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1386.4494354248047, | |
| "epoch": 0.6349206349206349, | |
| "grad_norm": 3.0205366611480713, | |
| "kl": 0.508544921875, | |
| "learning_rate": 1.061043252615451e-06, | |
| "loss": 0.0343, | |
| "reward": 0.13586309785023332, | |
| "reward_std": 0.10630648182705045, | |
| "rewards/accuracy_reward": 0.13586309785023332, | |
| "rewards/format_reward": 0.0, | |
| "step": 310 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1371.6342407226562, | |
| "epoch": 0.6451612903225806, | |
| "grad_norm": 1.5079602003097534, | |
| "kl": 0.572265625, | |
| "learning_rate": 1.0100128883970838e-06, | |
| "loss": 0.0347, | |
| "reward": 0.13020833493210376, | |
| "reward_std": 0.104894710611552, | |
| "rewards/accuracy_reward": 0.13020833493210376, | |
| "rewards/format_reward": 0.0, | |
| "step": 315 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1415.7412506103515, | |
| "epoch": 0.6554019457245264, | |
| "grad_norm": 1.1102583408355713, | |
| "kl": 0.466064453125, | |
| "learning_rate": 9.596097862593594e-07, | |
| "loss": 0.0346, | |
| "reward": 0.11889881128445268, | |
| "reward_std": 0.10287130940705538, | |
| "rewards/accuracy_reward": 0.11889881128445268, | |
| "rewards/format_reward": 0.0, | |
| "step": 320 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1400.208950805664, | |
| "epoch": 0.6656426011264721, | |
| "grad_norm": 1.7711780071258545, | |
| "kl": 0.5921875, | |
| "learning_rate": 9.098984702560116e-07, | |
| "loss": 0.038, | |
| "reward": 0.12872024006210267, | |
| "reward_std": 0.10296571510843933, | |
| "rewards/accuracy_reward": 0.12872024006210267, | |
| "rewards/format_reward": 0.0, | |
| "step": 325 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1419.4535919189452, | |
| "epoch": 0.6758832565284179, | |
| "grad_norm": 1.5115975141525269, | |
| "kl": 0.464794921875, | |
| "learning_rate": 8.609425788435818e-07, | |
| "loss": 0.0352, | |
| "reward": 0.11815476433839649, | |
| "reward_std": 0.09560534581542016, | |
| "rewards/accuracy_reward": 0.11815476433839649, | |
| "rewards/format_reward": 0.0, | |
| "step": 330 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1371.5564270019531, | |
| "epoch": 0.6861239119303636, | |
| "grad_norm": 1.2688425779342651, | |
| "kl": 0.593115234375, | |
| "learning_rate": 8.128047834139907e-07, | |
| "loss": 0.0373, | |
| "reward": 0.1520833361428231, | |
| "reward_std": 0.10912553807720542, | |
| "rewards/accuracy_reward": 0.1520833361428231, | |
| "rewards/format_reward": 0.0, | |
| "step": 335 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1391.9670043945312, | |
| "epoch": 0.6963645673323092, | |
| "grad_norm": 1.829864740371704, | |
| "kl": 0.538720703125, | |
| "learning_rate": 7.65546708065104e-07, | |
| "loss": 0.0385, | |
| "reward": 0.13824404990300537, | |
| "reward_std": 0.11598918633535504, | |
| "rewards/accuracy_reward": 0.13824404990300537, | |
| "rewards/format_reward": 0.0, | |
| "step": 340 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1407.3750274658203, | |
| "epoch": 0.706605222734255, | |
| "grad_norm": 2.415090322494507, | |
| "kl": 0.52529296875, | |
| "learning_rate": 7.192288507120073e-07, | |
| "loss": 0.0326, | |
| "reward": 0.12991071683354677, | |
| "reward_std": 0.09881549612618983, | |
| "rewards/accuracy_reward": 0.12991071683354677, | |
| "rewards/format_reward": 0.0, | |
| "step": 345 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1416.4004791259765, | |
| "epoch": 0.7168458781362007, | |
| "grad_norm": 2.3220245838165283, | |
| "kl": 0.397216796875, | |
| "learning_rate": 6.739105056399712e-07, | |
| "loss": 0.0236, | |
| "reward": 0.14464285979047417, | |
| "reward_std": 0.1005579138174653, | |
| "rewards/accuracy_reward": 0.14464285979047417, | |
| "rewards/format_reward": 0.0, | |
| "step": 350 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1480.5619354248047, | |
| "epoch": 0.7270865335381465, | |
| "grad_norm": 2.364630699157715, | |
| "kl": 0.56025390625, | |
| "learning_rate": 6.296496875982565e-07, | |
| "loss": 0.0327, | |
| "reward": 0.11845238371752202, | |
| "reward_std": 0.089572025090456, | |
| "rewards/accuracy_reward": 0.11845238371752202, | |
| "rewards/format_reward": 0.0, | |
| "step": 355 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1497.1189331054688, | |
| "epoch": 0.7373271889400922, | |
| "grad_norm": 1.7453265190124512, | |
| "kl": 0.359423828125, | |
| "learning_rate": 5.86503057531935e-07, | |
| "loss": 0.0196, | |
| "reward": 0.1288690499495715, | |
| "reward_std": 0.09691634746268392, | |
| "rewards/accuracy_reward": 0.1288690499495715, | |
| "rewards/format_reward": 0.0, | |
| "step": 360 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1490.572946166992, | |
| "epoch": 0.7475678443420379, | |
| "grad_norm": 2.5578675270080566, | |
| "kl": 0.44873046875, | |
| "learning_rate": 5.445258500467891e-07, | |
| "loss": 0.0306, | |
| "reward": 0.13690476482734085, | |
| "reward_std": 0.09427342982962728, | |
| "rewards/accuracy_reward": 0.13690476482734085, | |
| "rewards/format_reward": 0.0, | |
| "step": 365 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1448.5931884765625, | |
| "epoch": 0.7578084997439836, | |
| "grad_norm": 1.473059058189392, | |
| "kl": 0.623779296875, | |
| "learning_rate": 5.037718027001601e-07, | |
| "loss": 0.0447, | |
| "reward": 0.12886904999613763, | |
| "reward_std": 0.10351952081546187, | |
| "rewards/accuracy_reward": 0.12886904999613763, | |
| "rewards/format_reward": 0.0, | |
| "step": 370 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1525.3043518066406, | |
| "epoch": 0.7680491551459293, | |
| "grad_norm": 2.5979256629943848, | |
| "kl": 0.438134765625, | |
| "learning_rate": 4.642930872082603e-07, | |
| "loss": 0.0294, | |
| "reward": 0.1340773832052946, | |
| "reward_std": 0.09693685229867696, | |
| "rewards/accuracy_reward": 0.1340773832052946, | |
| "rewards/format_reward": 0.0, | |
| "step": 375 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1548.1570190429688, | |
| "epoch": 0.778289810547875, | |
| "grad_norm": 1.8802986145019531, | |
| "kl": 0.41083984375, | |
| "learning_rate": 4.261402426580037e-07, | |
| "loss": 0.0239, | |
| "reward": 0.1302083361428231, | |
| "reward_std": 0.09426894560456275, | |
| "rewards/accuracy_reward": 0.1302083361428231, | |
| "rewards/format_reward": 0.0, | |
| "step": 380 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1499.0522583007812, | |
| "epoch": 0.7885304659498208, | |
| "grad_norm": 1.5898677110671997, | |
| "kl": 0.464453125, | |
| "learning_rate": 3.893621108088737e-07, | |
| "loss": 0.0377, | |
| "reward": 0.13675595503300428, | |
| "reward_std": 0.1057541430927813, | |
| "rewards/accuracy_reward": 0.13675595503300428, | |
| "rewards/format_reward": 0.0, | |
| "step": 385 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1490.6623809814453, | |
| "epoch": 0.7987711213517665, | |
| "grad_norm": 1.5012719631195068, | |
| "kl": 0.517529296875, | |
| "learning_rate": 3.540057735676388e-07, | |
| "loss": 0.0362, | |
| "reward": 0.13720238325186074, | |
| "reward_std": 0.11000550435855985, | |
| "rewards/accuracy_reward": 0.13720238325186074, | |
| "rewards/format_reward": 0.0, | |
| "step": 390 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1487.6908081054687, | |
| "epoch": 0.8090117767537123, | |
| "grad_norm": 4.934905529022217, | |
| "kl": 0.600732421875, | |
| "learning_rate": 3.20116492715959e-07, | |
| "loss": 0.0425, | |
| "reward": 0.13824405027553438, | |
| "reward_std": 0.10485180700197816, | |
| "rewards/accuracy_reward": 0.13824405027553438, | |
| "rewards/format_reward": 0.0, | |
| "step": 395 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1447.5931762695313, | |
| "epoch": 0.819252432155658, | |
| "grad_norm": 1.9514836072921753, | |
| "kl": 0.618359375, | |
| "learning_rate": 2.8773765196804834e-07, | |
| "loss": 0.0428, | |
| "reward": 0.12217262103222311, | |
| "reward_std": 0.10348854968324303, | |
| "rewards/accuracy_reward": 0.12217262103222311, | |
| "rewards/format_reward": 0.0, | |
| "step": 400 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1489.2579193115234, | |
| "epoch": 0.8294930875576036, | |
| "grad_norm": 4.070176601409912, | |
| "kl": 0.48359375, | |
| "learning_rate": 2.5691070143256447e-07, | |
| "loss": 0.0375, | |
| "reward": 0.13288690745830536, | |
| "reward_std": 0.10663077728822827, | |
| "rewards/accuracy_reward": 0.13288690745830536, | |
| "rewards/format_reward": 0.0, | |
| "step": 405 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1444.0242858886718, | |
| "epoch": 0.8397337429595494, | |
| "grad_norm": 2.602212429046631, | |
| "kl": 0.635986328125, | |
| "learning_rate": 2.276751045498197e-07, | |
| "loss": 0.0414, | |
| "reward": 0.14747024136595427, | |
| "reward_std": 0.10692896069958806, | |
| "rewards/accuracy_reward": 0.14747024136595427, | |
| "rewards/format_reward": 0.0, | |
| "step": 410 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1452.9589599609376, | |
| "epoch": 0.8499743983614951, | |
| "grad_norm": 3.2816104888916016, | |
| "kl": 0.64755859375, | |
| "learning_rate": 2.0006828757224982e-07, | |
| "loss": 0.0374, | |
| "reward": 0.14821428790455685, | |
| "reward_std": 0.10495366291143, | |
| "rewards/accuracy_reward": 0.14821428790455685, | |
| "rewards/format_reward": 0.0, | |
| "step": 415 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1461.2986938476563, | |
| "epoch": 0.8602150537634409, | |
| "grad_norm": 1.5426130294799805, | |
| "kl": 0.415869140625, | |
| "learning_rate": 1.7412559165281155e-07, | |
| "loss": 0.0259, | |
| "reward": 0.1513392882887274, | |
| "reward_std": 0.099711880274117, | |
| "rewards/accuracy_reward": 0.1513392882887274, | |
| "rewards/format_reward": 0.0, | |
| "step": 420 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1480.4108947753907, | |
| "epoch": 0.8704557091653866, | |
| "grad_norm": 1.577873706817627, | |
| "kl": 0.3802734375, | |
| "learning_rate": 1.4988022760263492e-07, | |
| "loss": 0.027, | |
| "reward": 0.12901785969734192, | |
| "reward_std": 0.10641210693866014, | |
| "rewards/accuracy_reward": 0.12901785969734192, | |
| "rewards/format_reward": 0.0, | |
| "step": 425 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1430.3998809814452, | |
| "epoch": 0.8806963645673324, | |
| "grad_norm": 2.2523367404937744, | |
| "kl": 0.503662109375, | |
| "learning_rate": 1.2736323337586142e-07, | |
| "loss": 0.0365, | |
| "reward": 0.13869047933258116, | |
| "reward_std": 0.1244213636033237, | |
| "rewards/accuracy_reward": 0.13869047933258116, | |
| "rewards/format_reward": 0.0, | |
| "step": 430 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1426.7138580322267, | |
| "epoch": 0.890937019969278, | |
| "grad_norm": 4.5399346351623535, | |
| "kl": 0.675048828125, | |
| "learning_rate": 1.0660343433608627e-07, | |
| "loss": 0.0481, | |
| "reward": 0.1412202403997071, | |
| "reward_std": 0.10602434603497386, | |
| "rewards/accuracy_reward": 0.1412202403997071, | |
| "rewards/format_reward": 0.0, | |
| "step": 435 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1438.0576110839843, | |
| "epoch": 0.9011776753712237, | |
| "grad_norm": 5.20224142074585, | |
| "kl": 0.7138671875, | |
| "learning_rate": 8.762740635526756e-08, | |
| "loss": 0.048, | |
| "reward": 0.13943452620878816, | |
| "reward_std": 0.11082206377759576, | |
| "rewards/accuracy_reward": 0.13943452620878816, | |
| "rewards/format_reward": 0.0, | |
| "step": 440 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1476.9360321044921, | |
| "epoch": 0.9114183307731695, | |
| "grad_norm": 2.1484456062316895, | |
| "kl": 0.59794921875, | |
| "learning_rate": 7.04594417923507e-08, | |
| "loss": 0.0401, | |
| "reward": 0.13556547947227954, | |
| "reward_std": 0.10029366509988905, | |
| "rewards/accuracy_reward": 0.13556547947227954, | |
| "rewards/format_reward": 0.0, | |
| "step": 445 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1484.7338073730468, | |
| "epoch": 0.9216589861751152, | |
| "grad_norm": 3.5503857135772705, | |
| "kl": 0.50009765625, | |
| "learning_rate": 5.512151839515234e-08, | |
| "loss": 0.036, | |
| "reward": 0.14583333618938923, | |
| "reward_std": 0.11404564101248979, | |
| "rewards/accuracy_reward": 0.14583333618938923, | |
| "rewards/format_reward": 0.0, | |
| "step": 450 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1441.9092498779296, | |
| "epoch": 0.931899641577061, | |
| "grad_norm": 4.833263397216797, | |
| "kl": 0.46787109375, | |
| "learning_rate": 4.16332711653184e-08, | |
| "loss": 0.0332, | |
| "reward": 0.14017857438884676, | |
| "reward_std": 0.10374454082921147, | |
| "rewards/accuracy_reward": 0.14017857438884676, | |
| "rewards/format_reward": 0.0, | |
| "step": 455 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1402.6604400634765, | |
| "epoch": 0.9421402969790067, | |
| "grad_norm": 2.325770378112793, | |
| "kl": 0.434716796875, | |
| "learning_rate": 3.0011967222373595e-08, | |
| "loss": 0.0299, | |
| "reward": 0.13675595447421074, | |
| "reward_std": 0.1053824592847377, | |
| "rewards/accuracy_reward": 0.13675595447421074, | |
| "rewards/format_reward": 0.0, | |
| "step": 460 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1410.3284545898437, | |
| "epoch": 0.9523809523809523, | |
| "grad_norm": 1.8935805559158325, | |
| "kl": 0.473193359375, | |
| "learning_rate": 2.0272483699039334e-08, | |
| "loss": 0.0327, | |
| "reward": 0.14107143036089836, | |
| "reward_std": 0.11868918919935822, | |
| "rewards/accuracy_reward": 0.14107143036089836, | |
| "rewards/format_reward": 0.0, | |
| "step": 465 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1431.0174346923827, | |
| "epoch": 0.9626216077828981, | |
| "grad_norm": 1.980904459953308, | |
| "kl": 0.47861328125, | |
| "learning_rate": 1.2427288696117589e-08, | |
| "loss": 0.0294, | |
| "reward": 0.14642857415601612, | |
| "reward_std": 0.10726110143586993, | |
| "rewards/accuracy_reward": 0.14642857415601612, | |
| "rewards/format_reward": 0.0, | |
| "step": 470 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1449.0582153320313, | |
| "epoch": 0.9728622631848438, | |
| "grad_norm": 4.871621131896973, | |
| "kl": 0.472021484375, | |
| "learning_rate": 6.486425321322987e-09, | |
| "loss": 0.0256, | |
| "reward": 0.1415178598370403, | |
| "reward_std": 0.10020972546190024, | |
| "rewards/accuracy_reward": 0.1415178598370403, | |
| "rewards/format_reward": 0.0, | |
| "step": 475 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1416.2077575683593, | |
| "epoch": 0.9831029185867896, | |
| "grad_norm": 3.306790590286255, | |
| "kl": 0.481982421875, | |
| "learning_rate": 2.457498832494187e-09, | |
| "loss": 0.0289, | |
| "reward": 0.13943452583625912, | |
| "reward_std": 0.10717600891366601, | |
| "rewards/accuracy_reward": 0.13943452583625912, | |
| "rewards/format_reward": 0.0, | |
| "step": 480 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1434.3193756103515, | |
| "epoch": 0.9933435739887353, | |
| "grad_norm": 3.3276352882385254, | |
| "kl": 0.465087890625, | |
| "learning_rate": 3.456669016438374e-10, | |
| "loss": 0.032, | |
| "reward": 0.13824404957704245, | |
| "reward_std": 0.10434536337852478, | |
| "rewards/accuracy_reward": 0.13824404957704245, | |
| "rewards/format_reward": 0.0, | |
| "step": 485 | |
| }, | |
| { | |
| "clip_ratio": 0.0, | |
| "completion_length": 1441.995569864909, | |
| "epoch": 0.9994879672299027, | |
| "kl": 0.4940592447916667, | |
| "reward": 0.14682539909457168, | |
| "reward_std": 0.11686915632647772, | |
| "rewards/accuracy_reward": 0.14682539909457168, | |
| "rewards/format_reward": 0.0, | |
| "step": 488, | |
| "total_flos": 0.0, | |
| "train_loss": 0.01953804415467455, | |
| "train_runtime": 102141.6698, | |
| "train_samples_per_second": 0.918, | |
| "train_steps_per_second": 0.005 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 488, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 100, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 24, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |