| { | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.2326934264107039, | |
| "eval_steps": 500, | |
| "global_step": 200, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "completion_length": 489.01953125, | |
| "epoch": 0.0011634671320535194, | |
| "grad_norm": 0.4066821416431487, | |
| "kl": 0.0004706382751464844, | |
| "learning_rate": 3.846153846153846e-08, | |
| "loss": 0.0, | |
| "reward": 0.107421875, | |
| "reward_std": 0.14256841503083706, | |
| "rewards/correctness_reward_func": 0.1015625, | |
| "rewards/strict_format_reward_func": 0.005859375, | |
| "step": 1 | |
| }, | |
| { | |
| "completion_length": 491.90234375, | |
| "epoch": 0.002326934264107039, | |
| "grad_norm": 1.2204081989870978, | |
| "kl": 0.0008008480072021484, | |
| "learning_rate": 7.692307692307692e-08, | |
| "loss": 0.0, | |
| "reward": 0.12109375, | |
| "reward_std": 0.22897969186306, | |
| "rewards/correctness_reward_func": 0.1015625, | |
| "rewards/strict_format_reward_func": 0.01953125, | |
| "step": 2 | |
| }, | |
| { | |
| "completion_length": 489.51953125, | |
| "epoch": 0.0034904013961605585, | |
| "grad_norm": 9.972307004201902, | |
| "kl": 0.004039764404296875, | |
| "learning_rate": 1.1538461538461539e-07, | |
| "loss": 0.0002, | |
| "reward": 0.064453125, | |
| "reward_std": 0.125604297965765, | |
| "rewards/correctness_reward_func": 0.0546875, | |
| "rewards/strict_format_reward_func": 0.009765625, | |
| "step": 3 | |
| }, | |
| { | |
| "completion_length": 506.64453125, | |
| "epoch": 0.004653868528214078, | |
| "grad_norm": 1.442368000970251, | |
| "kl": 0.009157180786132812, | |
| "learning_rate": 1.5384615384615385e-07, | |
| "loss": 0.0004, | |
| "reward": 0.09765625, | |
| "reward_std": 0.17736226692795753, | |
| "rewards/correctness_reward_func": 0.0859375, | |
| "rewards/strict_format_reward_func": 0.01171875, | |
| "step": 4 | |
| }, | |
| { | |
| "completion_length": 494.796875, | |
| "epoch": 0.005817335660267597, | |
| "grad_norm": 2.215173709920235, | |
| "kl": 0.004576206207275391, | |
| "learning_rate": 1.9230769230769231e-07, | |
| "loss": 0.0002, | |
| "reward": 0.09375, | |
| "reward_std": 0.1610843911767006, | |
| "rewards/correctness_reward_func": 0.0859375, | |
| "rewards/strict_format_reward_func": 0.0078125, | |
| "step": 5 | |
| }, | |
| { | |
| "completion_length": 500.6875, | |
| "epoch": 0.006980802792321117, | |
| "grad_norm": 1.5436419526429799, | |
| "kl": 0.0007085800170898438, | |
| "learning_rate": 2.3076923076923078e-07, | |
| "loss": 0.0, | |
| "reward": 0.15234375, | |
| "reward_std": 0.24702188372612, | |
| "rewards/correctness_reward_func": 0.140625, | |
| "rewards/strict_format_reward_func": 0.01171875, | |
| "step": 6 | |
| }, | |
| { | |
| "completion_length": 502.09375, | |
| "epoch": 0.008144269924374637, | |
| "grad_norm": 16.600174739059614, | |
| "kl": 0.1050872802734375, | |
| "learning_rate": 2.692307692307692e-07, | |
| "loss": 0.0042, | |
| "reward": 0.126953125, | |
| "reward_std": 0.1962406411767006, | |
| "rewards/correctness_reward_func": 0.1171875, | |
| "rewards/strict_format_reward_func": 0.009765625, | |
| "step": 7 | |
| }, | |
| { | |
| "completion_length": 574.51171875, | |
| "epoch": 0.009307737056428156, | |
| "grad_norm": 151.48079010972526, | |
| "kl": 0.18497467041015625, | |
| "learning_rate": 3.076923076923077e-07, | |
| "loss": 0.0074, | |
| "reward": 0.115234375, | |
| "reward_std": 0.1714012213051319, | |
| "rewards/correctness_reward_func": 0.1015625, | |
| "rewards/strict_format_reward_func": 0.013671875, | |
| "step": 8 | |
| }, | |
| { | |
| "completion_length": 456.30859375, | |
| "epoch": 0.010471204188481676, | |
| "grad_norm": 3.068318566100937, | |
| "kl": 0.007595062255859375, | |
| "learning_rate": 3.461538461538461e-07, | |
| "loss": 0.0003, | |
| "reward": 0.12890625, | |
| "reward_std": 0.20751213282346725, | |
| "rewards/correctness_reward_func": 0.1171875, | |
| "rewards/strict_format_reward_func": 0.01171875, | |
| "step": 9 | |
| }, | |
| { | |
| "completion_length": 467.6171875, | |
| "epoch": 0.011634671320535195, | |
| "grad_norm": 5.54045688921292, | |
| "kl": 0.02019977569580078, | |
| "learning_rate": 3.8461538461538463e-07, | |
| "loss": 0.0008, | |
| "reward": 0.111328125, | |
| "reward_std": 0.2094484455883503, | |
| "rewards/correctness_reward_func": 0.1015625, | |
| "rewards/strict_format_reward_func": 0.009765625, | |
| "step": 10 | |
| }, | |
| { | |
| "completion_length": 522.4609375, | |
| "epoch": 0.012798138452588714, | |
| "grad_norm": 1.6672527087542282, | |
| "kl": 0.007293701171875, | |
| "learning_rate": 4.2307692307692304e-07, | |
| "loss": 0.0003, | |
| "reward": 0.09375, | |
| "reward_std": 0.14160171803086996, | |
| "rewards/correctness_reward_func": 0.078125, | |
| "rewards/strict_format_reward_func": 0.015625, | |
| "step": 11 | |
| }, | |
| { | |
| "completion_length": 554.41015625, | |
| "epoch": 0.013961605584642234, | |
| "grad_norm": 15.713162541614723, | |
| "kl": 0.022916793823242188, | |
| "learning_rate": 4.6153846153846156e-07, | |
| "loss": 0.0009, | |
| "reward": 0.05859375, | |
| "reward_std": 0.09407384321093559, | |
| "rewards/correctness_reward_func": 0.046875, | |
| "rewards/strict_format_reward_func": 0.01171875, | |
| "step": 12 | |
| }, | |
| { | |
| "completion_length": 472.33203125, | |
| "epoch": 0.015125072716695753, | |
| "grad_norm": 9.999083570023721, | |
| "kl": 0.00162506103515625, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0001, | |
| "reward": 0.099609375, | |
| "reward_std": 0.16796875, | |
| "rewards/correctness_reward_func": 0.09375, | |
| "rewards/strict_format_reward_func": 0.005859375, | |
| "step": 13 | |
| }, | |
| { | |
| "completion_length": 471.0390625, | |
| "epoch": 0.016288539848749273, | |
| "grad_norm": 14.601550530650117, | |
| "kl": 0.018660545349121094, | |
| "learning_rate": 5.384615384615384e-07, | |
| "loss": 0.0007, | |
| "reward": 0.142578125, | |
| "reward_std": 0.2406984455883503, | |
| "rewards/correctness_reward_func": 0.1328125, | |
| "rewards/strict_format_reward_func": 0.009765625, | |
| "step": 14 | |
| }, | |
| { | |
| "completion_length": 513.76953125, | |
| "epoch": 0.017452006980802792, | |
| "grad_norm": 3.897589890240525, | |
| "kl": 0.0014133453369140625, | |
| "learning_rate": 5.769230769230768e-07, | |
| "loss": 0.0001, | |
| "reward": 0.078125, | |
| "reward_std": 0.13499781489372253, | |
| "rewards/correctness_reward_func": 0.0703125, | |
| "rewards/strict_format_reward_func": 0.0078125, | |
| "step": 15 | |
| }, | |
| { | |
| "completion_length": 461.609375, | |
| "epoch": 0.01861547411285631, | |
| "grad_norm": 0.2331700635519536, | |
| "kl": 0.0008556842803955078, | |
| "learning_rate": 6.153846153846154e-07, | |
| "loss": 0.0, | |
| "reward": 0.080078125, | |
| "reward_std": 0.13233871944248676, | |
| "rewards/correctness_reward_func": 0.0703125, | |
| "rewards/strict_format_reward_func": 0.009765625, | |
| "step": 16 | |
| }, | |
| { | |
| "completion_length": 493.96484375, | |
| "epoch": 0.01977894124490983, | |
| "grad_norm": 0.6332584588781173, | |
| "kl": 0.0008721351623535156, | |
| "learning_rate": 6.538461538461538e-07, | |
| "loss": 0.0, | |
| "reward": 0.109375, | |
| "reward_std": 0.19749781489372253, | |
| "rewards/correctness_reward_func": 0.09375, | |
| "rewards/strict_format_reward_func": 0.015625, | |
| "step": 17 | |
| }, | |
| { | |
| "completion_length": 499.87890625, | |
| "epoch": 0.020942408376963352, | |
| "grad_norm": 6.416190324649871, | |
| "kl": 0.0064716339111328125, | |
| "learning_rate": 6.923076923076922e-07, | |
| "loss": 0.0003, | |
| "reward": 0.0859375, | |
| "reward_std": 0.1454593911767006, | |
| "rewards/correctness_reward_func": 0.078125, | |
| "rewards/strict_format_reward_func": 0.0078125, | |
| "step": 18 | |
| }, | |
| { | |
| "completion_length": 560.52734375, | |
| "epoch": 0.02210587550901687, | |
| "grad_norm": 0.18792971982824808, | |
| "kl": 0.0007505416870117188, | |
| "learning_rate": 7.307692307692307e-07, | |
| "loss": 0.0, | |
| "reward": 0.083984375, | |
| "reward_std": 0.16796875, | |
| "rewards/correctness_reward_func": 0.078125, | |
| "rewards/strict_format_reward_func": 0.005859375, | |
| "step": 19 | |
| }, | |
| { | |
| "completion_length": 475.859375, | |
| "epoch": 0.02326934264107039, | |
| "grad_norm": 57.89397166781348, | |
| "kl": 0.42646121978759766, | |
| "learning_rate": 7.692307692307693e-07, | |
| "loss": 0.0171, | |
| "reward": 0.119140625, | |
| "reward_std": 0.17447129637002945, | |
| "rewards/correctness_reward_func": 0.1015625, | |
| "rewards/strict_format_reward_func": 0.017578125, | |
| "step": 20 | |
| }, | |
| { | |
| "completion_length": 490.015625, | |
| "epoch": 0.02443280977312391, | |
| "grad_norm": 8.434807442056037, | |
| "kl": 0.023256301879882812, | |
| "learning_rate": 8.076923076923077e-07, | |
| "loss": 0.0009, | |
| "reward": 0.1171875, | |
| "reward_std": 0.19881487637758255, | |
| "rewards/correctness_reward_func": 0.09375, | |
| "rewards/strict_format_reward_func": 0.0234375, | |
| "step": 21 | |
| }, | |
| { | |
| "completion_length": 506.33984375, | |
| "epoch": 0.025596276905177427, | |
| "grad_norm": 1.1653419639417089, | |
| "kl": 0.00223541259765625, | |
| "learning_rate": 8.461538461538461e-07, | |
| "loss": 0.0001, | |
| "reward": 0.10546875, | |
| "reward_std": 0.18824483826756477, | |
| "rewards/correctness_reward_func": 0.09375, | |
| "rewards/strict_format_reward_func": 0.01171875, | |
| "step": 22 | |
| }, | |
| { | |
| "completion_length": 511.375, | |
| "epoch": 0.02675974403723095, | |
| "grad_norm": 0.41128365761978974, | |
| "kl": 0.0008344650268554688, | |
| "learning_rate": 8.846153846153846e-07, | |
| "loss": 0.0, | |
| "reward": 0.150390625, | |
| "reward_std": 0.25167298316955566, | |
| "rewards/correctness_reward_func": 0.1328125, | |
| "rewards/strict_format_reward_func": 0.017578125, | |
| "step": 23 | |
| }, | |
| { | |
| "completion_length": 528.53125, | |
| "epoch": 0.027923211169284468, | |
| "grad_norm": 1.4081194117907943, | |
| "kl": 0.0036749839782714844, | |
| "learning_rate": 9.230769230769231e-07, | |
| "loss": 0.0001, | |
| "reward": 0.09765625, | |
| "reward_std": 0.16415445879101753, | |
| "rewards/correctness_reward_func": 0.09375, | |
| "rewards/strict_format_reward_func": 0.00390625, | |
| "step": 24 | |
| }, | |
| { | |
| "completion_length": 566.2265625, | |
| "epoch": 0.029086678301337987, | |
| "grad_norm": 1.1799989005370515, | |
| "kl": 0.009830474853515625, | |
| "learning_rate": 9.615384615384615e-07, | |
| "loss": 0.0004, | |
| "reward": 0.107421875, | |
| "reward_std": 0.16424159705638885, | |
| "rewards/correctness_reward_func": 0.09375, | |
| "rewards/strict_format_reward_func": 0.013671875, | |
| "step": 25 | |
| }, | |
| { | |
| "completion_length": 438.703125, | |
| "epoch": 0.030250145433391506, | |
| "grad_norm": 3.104360216151301, | |
| "kl": 0.03290557861328125, | |
| "learning_rate": 1e-06, | |
| "loss": 0.0013, | |
| "reward": 0.15234375, | |
| "reward_std": 0.20628703013062477, | |
| "rewards/correctness_reward_func": 0.140625, | |
| "rewards/strict_format_reward_func": 0.01171875, | |
| "step": 26 | |
| }, | |
| { | |
| "completion_length": 539.5859375, | |
| "epoch": 0.031413612565445025, | |
| "grad_norm": 0.5338598772393494, | |
| "kl": 0.005832672119140625, | |
| "learning_rate": 9.99996444102478e-07, | |
| "loss": 0.0002, | |
| "reward": 0.1015625, | |
| "reward_std": 0.17196696251630783, | |
| "rewards/correctness_reward_func": 0.0859375, | |
| "rewards/strict_format_reward_func": 0.015625, | |
| "step": 27 | |
| }, | |
| { | |
| "completion_length": 497.140625, | |
| "epoch": 0.03257707969749855, | |
| "grad_norm": 2.5666979824433844, | |
| "kl": 0.018198013305664062, | |
| "learning_rate": 9.999857764604895e-07, | |
| "loss": 0.0007, | |
| "reward": 0.140625, | |
| "reward_std": 0.21521097421646118, | |
| "rewards/correctness_reward_func": 0.125, | |
| "rewards/strict_format_reward_func": 0.015625, | |
| "step": 28 | |
| }, | |
| { | |
| "completion_length": 485.59375, | |
| "epoch": 0.03374054682955206, | |
| "grad_norm": 6.4345252573344816, | |
| "kl": 0.005001068115234375, | |
| "learning_rate": 9.999679972257667e-07, | |
| "loss": 0.0002, | |
| "reward": 0.1875, | |
| "reward_std": 0.2648322060704231, | |
| "rewards/correctness_reward_func": 0.1640625, | |
| "rewards/strict_format_reward_func": 0.0234375, | |
| "step": 29 | |
| }, | |
| { | |
| "completion_length": 416.53125, | |
| "epoch": 0.034904013961605584, | |
| "grad_norm": 2.602336527370174, | |
| "kl": 0.0277099609375, | |
| "learning_rate": 9.999431066511943e-07, | |
| "loss": 0.0011, | |
| "reward": 0.125, | |
| "reward_std": 0.21884196251630783, | |
| "rewards/correctness_reward_func": 0.109375, | |
| "rewards/strict_format_reward_func": 0.015625, | |
| "step": 30 | |
| }, | |
| { | |
| "completion_length": 485.08203125, | |
| "epoch": 0.03606748109365911, | |
| "grad_norm": 1.5342850964012629, | |
| "kl": 0.014495849609375, | |
| "learning_rate": 9.999111050908056e-07, | |
| "loss": 0.0006, | |
| "reward": 0.197265625, | |
| "reward_std": 0.3104500323534012, | |
| "rewards/correctness_reward_func": 0.1796875, | |
| "rewards/strict_format_reward_func": 0.017578125, | |
| "step": 31 | |
| }, | |
| { | |
| "completion_length": 531.640625, | |
| "epoch": 0.03723094822571262, | |
| "grad_norm": 20.359434755302512, | |
| "kl": 0.06761932373046875, | |
| "learning_rate": 9.998719929997773e-07, | |
| "loss": 0.0027, | |
| "reward": 0.142578125, | |
| "reward_std": 0.19669455289840698, | |
| "rewards/correctness_reward_func": 0.1171875, | |
| "rewards/strict_format_reward_func": 0.025390625, | |
| "step": 32 | |
| }, | |
| { | |
| "completion_length": 514.54296875, | |
| "epoch": 0.038394415357766144, | |
| "grad_norm": 0.44887804375426404, | |
| "kl": 0.003627777099609375, | |
| "learning_rate": 9.998257709344243e-07, | |
| "loss": 0.0001, | |
| "reward": 0.173828125, | |
| "reward_std": 0.24332467839121819, | |
| "rewards/correctness_reward_func": 0.171875, | |
| "rewards/strict_format_reward_func": 0.001953125, | |
| "step": 33 | |
| }, | |
| { | |
| "completion_length": 492.8125, | |
| "epoch": 0.03955788248981966, | |
| "grad_norm": 3.197350305028001, | |
| "kl": 0.007907867431640625, | |
| "learning_rate": 9.997724395521901e-07, | |
| "loss": 0.0003, | |
| "reward": 0.14453125, | |
| "reward_std": 0.24139471352100372, | |
| "rewards/correctness_reward_func": 0.1171875, | |
| "rewards/strict_format_reward_func": 0.02734375, | |
| "step": 34 | |
| }, | |
| { | |
| "completion_length": 407.99609375, | |
| "epoch": 0.04072134962187318, | |
| "grad_norm": 0.4960254389285239, | |
| "kl": 0.01204681396484375, | |
| "learning_rate": 9.997119996116382e-07, | |
| "loss": 0.0005, | |
| "reward": 0.240234375, | |
| "reward_std": 0.31166573986411095, | |
| "rewards/correctness_reward_func": 0.2109375, | |
| "rewards/strict_format_reward_func": 0.029296875, | |
| "step": 35 | |
| }, | |
| { | |
| "completion_length": 398.2421875, | |
| "epoch": 0.041884816753926704, | |
| "grad_norm": 9.032088364099023, | |
| "kl": 0.03583526611328125, | |
| "learning_rate": 9.996444519724418e-07, | |
| "loss": 0.0014, | |
| "reward": 0.15625, | |
| "reward_std": 0.23204976320266724, | |
| "rewards/correctness_reward_func": 0.140625, | |
| "rewards/strict_format_reward_func": 0.015625, | |
| "step": 36 | |
| }, | |
| { | |
| "completion_length": 434.36328125, | |
| "epoch": 0.04304828388598022, | |
| "grad_norm": 0.5171450751911104, | |
| "kl": 0.0065155029296875, | |
| "learning_rate": 9.995697975953707e-07, | |
| "loss": 0.0003, | |
| "reward": 0.193359375, | |
| "reward_std": 0.29260406643152237, | |
| "rewards/correctness_reward_func": 0.15625, | |
| "rewards/strict_format_reward_func": 0.037109375, | |
| "step": 37 | |
| }, | |
| { | |
| "completion_length": 488.390625, | |
| "epoch": 0.04421175101803374, | |
| "grad_norm": 0.40237380286765173, | |
| "kl": 0.00611114501953125, | |
| "learning_rate": 9.994880375422784e-07, | |
| "loss": 0.0002, | |
| "reward": 0.26171875, | |
| "reward_std": 0.3653857484459877, | |
| "rewards/correctness_reward_func": 0.203125, | |
| "rewards/strict_format_reward_func": 0.05859375, | |
| "step": 38 | |
| }, | |
| { | |
| "completion_length": 420.58984375, | |
| "epoch": 0.04537521815008726, | |
| "grad_norm": 5.104453757523126, | |
| "kl": 0.01647186279296875, | |
| "learning_rate": 9.99399172976086e-07, | |
| "loss": 0.0007, | |
| "reward": 0.29296875, | |
| "reward_std": 0.37514638155698776, | |
| "rewards/correctness_reward_func": 0.234375, | |
| "rewards/strict_format_reward_func": 0.05859375, | |
| "step": 39 | |
| }, | |
| { | |
| "completion_length": 394.3828125, | |
| "epoch": 0.04653868528214078, | |
| "grad_norm": 3.4950699311546694, | |
| "kl": 0.009918212890625, | |
| "learning_rate": 9.993032051607668e-07, | |
| "loss": 0.0004, | |
| "reward": 0.267578125, | |
| "reward_std": 0.31589680910110474, | |
| "rewards/correctness_reward_func": 0.2265625, | |
| "rewards/strict_format_reward_func": 0.041015625, | |
| "step": 40 | |
| }, | |
| { | |
| "completion_length": 413.97265625, | |
| "epoch": 0.0477021524141943, | |
| "grad_norm": 1.061976991134195, | |
| "kl": 0.0106964111328125, | |
| "learning_rate": 9.992001354613277e-07, | |
| "loss": 0.0004, | |
| "reward": 0.21484375, | |
| "reward_std": 0.2792557002976537, | |
| "rewards/correctness_reward_func": 0.171875, | |
| "rewards/strict_format_reward_func": 0.04296875, | |
| "step": 41 | |
| }, | |
| { | |
| "completion_length": 397.8515625, | |
| "epoch": 0.04886561954624782, | |
| "grad_norm": 0.765028003111525, | |
| "kl": 0.01513671875, | |
| "learning_rate": 9.990899653437901e-07, | |
| "loss": 0.0006, | |
| "reward": 0.388671875, | |
| "reward_std": 0.41248171031475067, | |
| "rewards/correctness_reward_func": 0.3125, | |
| "rewards/strict_format_reward_func": 0.076171875, | |
| "step": 42 | |
| }, | |
| { | |
| "completion_length": 397.07421875, | |
| "epoch": 0.05002908667830134, | |
| "grad_norm": 12.065798447865369, | |
| "kl": 0.06500244140625, | |
| "learning_rate": 9.989726963751682e-07, | |
| "loss": 0.0026, | |
| "reward": 0.3125, | |
| "reward_std": 0.4369198568165302, | |
| "rewards/correctness_reward_func": 0.234375, | |
| "rewards/strict_format_reward_func": 0.078125, | |
| "step": 43 | |
| }, | |
| { | |
| "completion_length": 384.0859375, | |
| "epoch": 0.051192553810354854, | |
| "grad_norm": 10.020233386348186, | |
| "kl": 0.1649932861328125, | |
| "learning_rate": 9.988483302234478e-07, | |
| "loss": 0.0066, | |
| "reward": 0.275390625, | |
| "reward_std": 0.3524422347545624, | |
| "rewards/correctness_reward_func": 0.1796875, | |
| "rewards/strict_format_reward_func": 0.095703125, | |
| "step": 44 | |
| }, | |
| { | |
| "completion_length": 424.24609375, | |
| "epoch": 0.05235602094240838, | |
| "grad_norm": 3809.223926800563, | |
| "kl": 24.387359619140625, | |
| "learning_rate": 9.987168686575623e-07, | |
| "loss": 0.9774, | |
| "reward": 0.26953125, | |
| "reward_std": 0.3424443453550339, | |
| "rewards/correctness_reward_func": 0.1796875, | |
| "rewards/strict_format_reward_func": 0.08984375, | |
| "step": 45 | |
| }, | |
| { | |
| "completion_length": 365.3359375, | |
| "epoch": 0.0535194880744619, | |
| "grad_norm": 4.815371512573417, | |
| "kl": 0.0827484130859375, | |
| "learning_rate": 9.98578313547367e-07, | |
| "loss": 0.0033, | |
| "reward": 0.39453125, | |
| "reward_std": 0.45618152618408203, | |
| "rewards/correctness_reward_func": 0.2890625, | |
| "rewards/strict_format_reward_func": 0.10546875, | |
| "step": 46 | |
| }, | |
| { | |
| "completion_length": 410.48828125, | |
| "epoch": 0.054682955206515414, | |
| "grad_norm": 3.1518114378957223, | |
| "kl": 0.03973388671875, | |
| "learning_rate": 9.98432666863613e-07, | |
| "loss": 0.0016, | |
| "reward": 0.365234375, | |
| "reward_std": 0.4011538214981556, | |
| "rewards/correctness_reward_func": 0.234375, | |
| "rewards/strict_format_reward_func": 0.130859375, | |
| "step": 47 | |
| }, | |
| { | |
| "completion_length": 409.66796875, | |
| "epoch": 0.055846422338568937, | |
| "grad_norm": 0.6202311602466106, | |
| "kl": 0.025848388671875, | |
| "learning_rate": 9.982799306779189e-07, | |
| "loss": 0.001, | |
| "reward": 0.591796875, | |
| "reward_std": 0.5869772136211395, | |
| "rewards/correctness_reward_func": 0.421875, | |
| "rewards/strict_format_reward_func": 0.169921875, | |
| "step": 48 | |
| }, | |
| { | |
| "completion_length": 375.16015625, | |
| "epoch": 0.05700988947062245, | |
| "grad_norm": 231.68110012914804, | |
| "kl": 0.854034423828125, | |
| "learning_rate": 9.98120107162742e-07, | |
| "loss": 0.0343, | |
| "reward": 0.42578125, | |
| "reward_std": 0.4635503552854061, | |
| "rewards/correctness_reward_func": 0.2421875, | |
| "rewards/strict_format_reward_func": 0.18359375, | |
| "step": 49 | |
| }, | |
| { | |
| "completion_length": 396.5, | |
| "epoch": 0.058173356602675974, | |
| "grad_norm": 8.054334866966464, | |
| "kl": 0.02935791015625, | |
| "learning_rate": 9.979531985913457e-07, | |
| "loss": 0.0012, | |
| "reward": 0.43359375, | |
| "reward_std": 0.5311293751001358, | |
| "rewards/correctness_reward_func": 0.2421875, | |
| "rewards/strict_format_reward_func": 0.19140625, | |
| "step": 50 | |
| }, | |
| { | |
| "completion_length": 341.55859375, | |
| "epoch": 0.059336823734729496, | |
| "grad_norm": 39.40136884647299, | |
| "kl": 0.515869140625, | |
| "learning_rate": 9.977792073377697e-07, | |
| "loss": 0.0206, | |
| "reward": 0.50390625, | |
| "reward_std": 0.5089018940925598, | |
| "rewards/correctness_reward_func": 0.3203125, | |
| "rewards/strict_format_reward_func": 0.18359375, | |
| "step": 51 | |
| }, | |
| { | |
| "completion_length": 375.640625, | |
| "epoch": 0.06050029086678301, | |
| "grad_norm": 20.902673745049487, | |
| "kl": 0.14080810546875, | |
| "learning_rate": 9.975981358767944e-07, | |
| "loss": 0.0056, | |
| "reward": 0.453125, | |
| "reward_std": 0.47038574516773224, | |
| "rewards/correctness_reward_func": 0.2578125, | |
| "rewards/strict_format_reward_func": 0.1953125, | |
| "step": 52 | |
| }, | |
| { | |
| "completion_length": 351.328125, | |
| "epoch": 0.061663757998836534, | |
| "grad_norm": 2.0754249823099595, | |
| "kl": 0.0267333984375, | |
| "learning_rate": 9.974099867839057e-07, | |
| "loss": 0.0011, | |
| "reward": 0.525390625, | |
| "reward_std": 0.46075718849897385, | |
| "rewards/correctness_reward_func": 0.2890625, | |
| "rewards/strict_format_reward_func": 0.236328125, | |
| "step": 53 | |
| }, | |
| { | |
| "completion_length": 385.4140625, | |
| "epoch": 0.06282722513089005, | |
| "grad_norm": 335.2509946112828, | |
| "kl": 1.28887939453125, | |
| "learning_rate": 9.972147627352593e-07, | |
| "loss": 0.0513, | |
| "reward": 0.533203125, | |
| "reward_std": 0.4158325716853142, | |
| "rewards/correctness_reward_func": 0.3125, | |
| "rewards/strict_format_reward_func": 0.220703125, | |
| "step": 54 | |
| }, | |
| { | |
| "completion_length": 315.51953125, | |
| "epoch": 0.06399069226294357, | |
| "grad_norm": 113.13047615199778, | |
| "kl": 0.79083251953125, | |
| "learning_rate": 9.970124665076417e-07, | |
| "loss": 0.0317, | |
| "reward": 0.5625, | |
| "reward_std": 0.5517344921827316, | |
| "rewards/correctness_reward_func": 0.3203125, | |
| "rewards/strict_format_reward_func": 0.2421875, | |
| "step": 55 | |
| }, | |
| { | |
| "completion_length": 313.81640625, | |
| "epoch": 0.0651541593949971, | |
| "grad_norm": 1.1795690608203528, | |
| "kl": 0.047210693359375, | |
| "learning_rate": 9.96803100978432e-07, | |
| "loss": 0.0019, | |
| "reward": 0.642578125, | |
| "reward_std": 0.47613072395324707, | |
| "rewards/correctness_reward_func": 0.34375, | |
| "rewards/strict_format_reward_func": 0.298828125, | |
| "step": 56 | |
| }, | |
| { | |
| "completion_length": 348.234375, | |
| "epoch": 0.06631762652705062, | |
| "grad_norm": 13.371406300782917, | |
| "kl": 0.11224365234375, | |
| "learning_rate": 9.965866691255597e-07, | |
| "loss": 0.0045, | |
| "reward": 0.66796875, | |
| "reward_std": 0.59251669049263, | |
| "rewards/correctness_reward_func": 0.390625, | |
| "rewards/strict_format_reward_func": 0.27734375, | |
| "step": 57 | |
| }, | |
| { | |
| "completion_length": 383.1796875, | |
| "epoch": 0.06748109365910412, | |
| "grad_norm": 1.6177251711954495, | |
| "kl": 0.030792236328125, | |
| "learning_rate": 9.963631740274622e-07, | |
| "loss": 0.0012, | |
| "reward": 0.5625, | |
| "reward_std": 0.47907302528619766, | |
| "rewards/correctness_reward_func": 0.2890625, | |
| "rewards/strict_format_reward_func": 0.2734375, | |
| "step": 58 | |
| }, | |
| { | |
| "completion_length": 355.9921875, | |
| "epoch": 0.06864456079115765, | |
| "grad_norm": 47.382528946856574, | |
| "kl": 0.235626220703125, | |
| "learning_rate": 9.961326188630425e-07, | |
| "loss": 0.0095, | |
| "reward": 0.607421875, | |
| "reward_std": 0.535815954208374, | |
| "rewards/correctness_reward_func": 0.34375, | |
| "rewards/strict_format_reward_func": 0.263671875, | |
| "step": 59 | |
| }, | |
| { | |
| "completion_length": 377.46875, | |
| "epoch": 0.06980802792321117, | |
| "grad_norm": 2.8906662162333294, | |
| "kl": 0.076416015625, | |
| "learning_rate": 9.95895006911623e-07, | |
| "loss": 0.0031, | |
| "reward": 0.6015625, | |
| "reward_std": 0.4677914008498192, | |
| "rewards/correctness_reward_func": 0.3046875, | |
| "rewards/strict_format_reward_func": 0.296875, | |
| "step": 60 | |
| }, | |
| { | |
| "completion_length": 307.0859375, | |
| "epoch": 0.07097149505526469, | |
| "grad_norm": 5.658248453671423, | |
| "kl": 0.06695556640625, | |
| "learning_rate": 9.956503415528982e-07, | |
| "loss": 0.0027, | |
| "reward": 0.60546875, | |
| "reward_std": 0.5141743049025536, | |
| "rewards/correctness_reward_func": 0.2734375, | |
| "rewards/strict_format_reward_func": 0.33203125, | |
| "step": 61 | |
| }, | |
| { | |
| "completion_length": 308.98046875, | |
| "epoch": 0.07213496218731821, | |
| "grad_norm": 4.0281675399539, | |
| "kl": 0.06524658203125, | |
| "learning_rate": 9.953986262668884e-07, | |
| "loss": 0.0026, | |
| "reward": 0.66015625, | |
| "reward_std": 0.4833526313304901, | |
| "rewards/correctness_reward_func": 0.28125, | |
| "rewards/strict_format_reward_func": 0.37890625, | |
| "step": 62 | |
| }, | |
| { | |
| "completion_length": 251.5625, | |
| "epoch": 0.07329842931937172, | |
| "grad_norm": 6.4708893162037455, | |
| "kl": 0.13079833984375, | |
| "learning_rate": 9.951398646338883e-07, | |
| "loss": 0.0052, | |
| "reward": 0.720703125, | |
| "reward_std": 0.4117320328950882, | |
| "rewards/correctness_reward_func": 0.3515625, | |
| "rewards/strict_format_reward_func": 0.369140625, | |
| "step": 63 | |
| }, | |
| { | |
| "completion_length": 299.58203125, | |
| "epoch": 0.07446189645142524, | |
| "grad_norm": 13.45087311669488, | |
| "kl": 0.15289306640625, | |
| "learning_rate": 9.948740603344172e-07, | |
| "loss": 0.0061, | |
| "reward": 0.81640625, | |
| "reward_std": 0.6002717912197113, | |
| "rewards/correctness_reward_func": 0.4765625, | |
| "rewards/strict_format_reward_func": 0.33984375, | |
| "step": 64 | |
| }, | |
| { | |
| "completion_length": 302.1953125, | |
| "epoch": 0.07562536358347877, | |
| "grad_norm": 18.560697276437615, | |
| "kl": 0.20904541015625, | |
| "learning_rate": 9.946012171491668e-07, | |
| "loss": 0.0083, | |
| "reward": 0.796875, | |
| "reward_std": 0.4715307354927063, | |
| "rewards/correctness_reward_func": 0.4140625, | |
| "rewards/strict_format_reward_func": 0.3828125, | |
| "step": 65 | |
| }, | |
| { | |
| "completion_length": 286.73828125, | |
| "epoch": 0.07678883071553229, | |
| "grad_norm": 14.086886383004378, | |
| "kl": 0.04949951171875, | |
| "learning_rate": 9.943213389589466e-07, | |
| "loss": 0.002, | |
| "reward": 0.82421875, | |
| "reward_std": 0.5994473099708557, | |
| "rewards/correctness_reward_func": 0.4453125, | |
| "rewards/strict_format_reward_func": 0.37890625, | |
| "step": 66 | |
| }, | |
| { | |
| "completion_length": 323.93359375, | |
| "epoch": 0.07795229784758581, | |
| "grad_norm": 19.144971581802082, | |
| "kl": 0.1656494140625, | |
| "learning_rate": 9.940344297446292e-07, | |
| "loss": 0.0066, | |
| "reward": 0.73828125, | |
| "reward_std": 0.43859150260686874, | |
| "rewards/correctness_reward_func": 0.3671875, | |
| "rewards/strict_format_reward_func": 0.37109375, | |
| "step": 67 | |
| }, | |
| { | |
| "completion_length": 350.01171875, | |
| "epoch": 0.07911576497963932, | |
| "grad_norm": 77.24678279515392, | |
| "kl": 0.53643798828125, | |
| "learning_rate": 9.937404935870937e-07, | |
| "loss": 0.0215, | |
| "reward": 0.61328125, | |
| "reward_std": 0.4140402674674988, | |
| "rewards/correctness_reward_func": 0.21875, | |
| "rewards/strict_format_reward_func": 0.39453125, | |
| "step": 68 | |
| }, | |
| { | |
| "completion_length": 305.87109375, | |
| "epoch": 0.08027923211169284, | |
| "grad_norm": 12.883736811037531, | |
| "kl": 0.25457763671875, | |
| "learning_rate": 9.934395346671673e-07, | |
| "loss": 0.0102, | |
| "reward": 0.705078125, | |
| "reward_std": 0.48615749180316925, | |
| "rewards/correctness_reward_func": 0.3125, | |
| "rewards/strict_format_reward_func": 0.392578125, | |
| "step": 69 | |
| }, | |
| { | |
| "completion_length": 279.07421875, | |
| "epoch": 0.08144269924374636, | |
| "grad_norm": 4.8084774229944705, | |
| "kl": 0.158447265625, | |
| "learning_rate": 9.93131557265567e-07, | |
| "loss": 0.0063, | |
| "reward": 0.775390625, | |
| "reward_std": 0.43615715205669403, | |
| "rewards/correctness_reward_func": 0.390625, | |
| "rewards/strict_format_reward_func": 0.384765625, | |
| "step": 70 | |
| }, | |
| { | |
| "completion_length": 271.90625, | |
| "epoch": 0.08260616637579989, | |
| "grad_norm": 209.3718112774744, | |
| "kl": 1.3720703125, | |
| "learning_rate": 9.928165657628363e-07, | |
| "loss": 0.0552, | |
| "reward": 0.771484375, | |
| "reward_std": 0.4666217863559723, | |
| "rewards/correctness_reward_func": 0.375, | |
| "rewards/strict_format_reward_func": 0.396484375, | |
| "step": 71 | |
| }, | |
| { | |
| "completion_length": 280.234375, | |
| "epoch": 0.08376963350785341, | |
| "grad_norm": 8.675435114113355, | |
| "kl": 0.08306884765625, | |
| "learning_rate": 9.924945646392856e-07, | |
| "loss": 0.0033, | |
| "reward": 0.724609375, | |
| "reward_std": 0.4121067523956299, | |
| "rewards/correctness_reward_func": 0.328125, | |
| "rewards/strict_format_reward_func": 0.396484375, | |
| "step": 72 | |
| }, | |
| { | |
| "completion_length": 298.13671875, | |
| "epoch": 0.08493310063990692, | |
| "grad_norm": 2.261657798977972, | |
| "kl": 0.05487060546875, | |
| "learning_rate": 9.92165558474927e-07, | |
| "loss": 0.0022, | |
| "reward": 0.79296875, | |
| "reward_std": 0.4348938390612602, | |
| "rewards/correctness_reward_func": 0.40625, | |
| "rewards/strict_format_reward_func": 0.38671875, | |
| "step": 73 | |
| }, | |
| { | |
| "completion_length": 275.25390625, | |
| "epoch": 0.08609656777196044, | |
| "grad_norm": 306.8537482842207, | |
| "kl": 2.76251220703125, | |
| "learning_rate": 9.918295519494089e-07, | |
| "loss": 0.1104, | |
| "reward": 0.853515625, | |
| "reward_std": 0.4975513890385628, | |
| "rewards/correctness_reward_func": 0.4375, | |
| "rewards/strict_format_reward_func": 0.416015625, | |
| "step": 74 | |
| }, | |
| { | |
| "completion_length": 303.42578125, | |
| "epoch": 0.08726003490401396, | |
| "grad_norm": 11.231963005062614, | |
| "kl": 0.20123291015625, | |
| "learning_rate": 9.91486549841951e-07, | |
| "loss": 0.008, | |
| "reward": 0.720703125, | |
| "reward_std": 0.46410517394542694, | |
| "rewards/correctness_reward_func": 0.3359375, | |
| "rewards/strict_format_reward_func": 0.384765625, | |
| "step": 75 | |
| }, | |
| { | |
| "completion_length": 277.7578125, | |
| "epoch": 0.08842350203606748, | |
| "grad_norm": 160.47901765005733, | |
| "kl": 1.1749267578125, | |
| "learning_rate": 9.91136557031274e-07, | |
| "loss": 0.0472, | |
| "reward": 0.74609375, | |
| "reward_std": 0.4058147594332695, | |
| "rewards/correctness_reward_func": 0.3125, | |
| "rewards/strict_format_reward_func": 0.43359375, | |
| "step": 76 | |
| }, | |
| { | |
| "completion_length": 340.8515625, | |
| "epoch": 0.089586969168121, | |
| "grad_norm": 381.85614779047495, | |
| "kl": 1.22967529296875, | |
| "learning_rate": 9.907795784955326e-07, | |
| "loss": 0.0492, | |
| "reward": 0.765625, | |
| "reward_std": 0.42379553616046906, | |
| "rewards/correctness_reward_func": 0.3515625, | |
| "rewards/strict_format_reward_func": 0.4140625, | |
| "step": 77 | |
| }, | |
| { | |
| "completion_length": 274.44921875, | |
| "epoch": 0.09075043630017451, | |
| "grad_norm": 64.47137880223688, | |
| "kl": 0.67572021484375, | |
| "learning_rate": 9.904156193122431e-07, | |
| "loss": 0.027, | |
| "reward": 0.80078125, | |
| "reward_std": 0.4361772760748863, | |
| "rewards/correctness_reward_func": 0.3828125, | |
| "rewards/strict_format_reward_func": 0.41796875, | |
| "step": 78 | |
| }, | |
| { | |
| "completion_length": 265.8359375, | |
| "epoch": 0.09191390343222804, | |
| "grad_norm": 27.903843212195564, | |
| "kl": 0.46124267578125, | |
| "learning_rate": 9.900446846582119e-07, | |
| "loss": 0.0185, | |
| "reward": 0.92578125, | |
| "reward_std": 0.5412914976477623, | |
| "rewards/correctness_reward_func": 0.5078125, | |
| "rewards/strict_format_reward_func": 0.41796875, | |
| "step": 79 | |
| }, | |
| { | |
| "completion_length": 320.8984375, | |
| "epoch": 0.09307737056428156, | |
| "grad_norm": 1.9761659190639995, | |
| "kl": 0.04864501953125, | |
| "learning_rate": 9.896667798094608e-07, | |
| "loss": 0.0019, | |
| "reward": 0.76953125, | |
| "reward_std": 0.4052440747618675, | |
| "rewards/correctness_reward_func": 0.375, | |
| "rewards/strict_format_reward_func": 0.39453125, | |
| "step": 80 | |
| }, | |
| { | |
| "completion_length": 312.84765625, | |
| "epoch": 0.09424083769633508, | |
| "grad_norm": 57.910687963151915, | |
| "kl": 0.12591552734375, | |
| "learning_rate": 9.892819101411543e-07, | |
| "loss": 0.005, | |
| "reward": 0.75, | |
| "reward_std": 0.49195902049541473, | |
| "rewards/correctness_reward_func": 0.3359375, | |
| "rewards/strict_format_reward_func": 0.4140625, | |
| "step": 81 | |
| }, | |
| { | |
| "completion_length": 271.83984375, | |
| "epoch": 0.0954043048283886, | |
| "grad_norm": 5.41836244806774, | |
| "kl": 0.08587646484375, | |
| "learning_rate": 9.888900811275203e-07, | |
| "loss": 0.0034, | |
| "reward": 0.71484375, | |
| "reward_std": 0.43381768465042114, | |
| "rewards/correctness_reward_func": 0.2890625, | |
| "rewards/strict_format_reward_func": 0.42578125, | |
| "step": 82 | |
| }, | |
| { | |
| "completion_length": 290.12109375, | |
| "epoch": 0.09656777196044211, | |
| "grad_norm": 0.8646498967374704, | |
| "kl": 0.0540771484375, | |
| "learning_rate": 9.884912983417743e-07, | |
| "loss": 0.0022, | |
| "reward": 0.892578125, | |
| "reward_std": 0.47797150164842606, | |
| "rewards/correctness_reward_func": 0.46875, | |
| "rewards/strict_format_reward_func": 0.423828125, | |
| "step": 83 | |
| }, | |
| { | |
| "completion_length": 224.84375, | |
| "epoch": 0.09773123909249563, | |
| "grad_norm": 2.249327867839398, | |
| "kl": 0.0767822265625, | |
| "learning_rate": 9.88085567456039e-07, | |
| "loss": 0.0031, | |
| "reward": 0.927734375, | |
| "reward_std": 0.5308554023504257, | |
| "rewards/correctness_reward_func": 0.4765625, | |
| "rewards/strict_format_reward_func": 0.451171875, | |
| "step": 84 | |
| }, | |
| { | |
| "completion_length": 268.13671875, | |
| "epoch": 0.09889470622454916, | |
| "grad_norm": 5.4143471770769445, | |
| "kl": 0.10394287109375, | |
| "learning_rate": 9.876728942412642e-07, | |
| "loss": 0.0042, | |
| "reward": 0.849609375, | |
| "reward_std": 0.44555214792490005, | |
| "rewards/correctness_reward_func": 0.4140625, | |
| "rewards/strict_format_reward_func": 0.435546875, | |
| "step": 85 | |
| }, | |
| { | |
| "completion_length": 310.32421875, | |
| "epoch": 0.10005817335660268, | |
| "grad_norm": 99.7419314506415, | |
| "kl": 0.43487548828125, | |
| "learning_rate": 9.872532845671449e-07, | |
| "loss": 0.0174, | |
| "reward": 0.619140625, | |
| "reward_std": 0.2979493774473667, | |
| "rewards/correctness_reward_func": 0.203125, | |
| "rewards/strict_format_reward_func": 0.416015625, | |
| "step": 86 | |
| }, | |
| { | |
| "completion_length": 336.62109375, | |
| "epoch": 0.1012216404886562, | |
| "grad_norm": 2.7658118091947093, | |
| "kl": 0.0557861328125, | |
| "learning_rate": 9.868267444020366e-07, | |
| "loss": 0.0022, | |
| "reward": 0.84375, | |
| "reward_std": 0.4424229711294174, | |
| "rewards/correctness_reward_func": 0.4140625, | |
| "rewards/strict_format_reward_func": 0.4296875, | |
| "step": 87 | |
| }, | |
| { | |
| "completion_length": 267.109375, | |
| "epoch": 0.10238510762070971, | |
| "grad_norm": 1.3296554071596351, | |
| "kl": 0.05279541015625, | |
| "learning_rate": 9.86393279812872e-07, | |
| "loss": 0.0021, | |
| "reward": 0.763671875, | |
| "reward_std": 0.40575000643730164, | |
| "rewards/correctness_reward_func": 0.3203125, | |
| "rewards/strict_format_reward_func": 0.443359375, | |
| "step": 88 | |
| }, | |
| { | |
| "completion_length": 286.4453125, | |
| "epoch": 0.10354857475276323, | |
| "grad_norm": 1.9800959128263669, | |
| "kl": 0.0650634765625, | |
| "learning_rate": 9.859528969650737e-07, | |
| "loss": 0.0026, | |
| "reward": 0.70703125, | |
| "reward_std": 0.36570068448781967, | |
| "rewards/correctness_reward_func": 0.2734375, | |
| "rewards/strict_format_reward_func": 0.43359375, | |
| "step": 89 | |
| }, | |
| { | |
| "completion_length": 315.234375, | |
| "epoch": 0.10471204188481675, | |
| "grad_norm": 0.9855921470718585, | |
| "kl": 0.057861328125, | |
| "learning_rate": 9.855056021224671e-07, | |
| "loss": 0.0023, | |
| "reward": 0.78125, | |
| "reward_std": 0.38219955191016197, | |
| "rewards/correctness_reward_func": 0.359375, | |
| "rewards/strict_format_reward_func": 0.421875, | |
| "step": 90 | |
| }, | |
| { | |
| "completion_length": 283.63671875, | |
| "epoch": 0.10587550901687028, | |
| "grad_norm": 2.2979566856097633, | |
| "kl": 0.05584716796875, | |
| "learning_rate": 9.850514016471902e-07, | |
| "loss": 0.0022, | |
| "reward": 0.814453125, | |
| "reward_std": 0.4690292477607727, | |
| "rewards/correctness_reward_func": 0.375, | |
| "rewards/strict_format_reward_func": 0.439453125, | |
| "step": 91 | |
| }, | |
| { | |
| "completion_length": 280.4921875, | |
| "epoch": 0.1070389761489238, | |
| "grad_norm": 54.88062525909732, | |
| "kl": 0.2271728515625, | |
| "learning_rate": 9.845903019996045e-07, | |
| "loss": 0.0091, | |
| "reward": 0.732421875, | |
| "reward_std": 0.3464353382587433, | |
| "rewards/correctness_reward_func": 0.296875, | |
| "rewards/strict_format_reward_func": 0.435546875, | |
| "step": 92 | |
| }, | |
| { | |
| "completion_length": 257.05078125, | |
| "epoch": 0.1082024432809773, | |
| "grad_norm": 2.4355438627389714, | |
| "kl": 0.16082763671875, | |
| "learning_rate": 9.841223097382027e-07, | |
| "loss": 0.0065, | |
| "reward": 0.921875, | |
| "reward_std": 0.5120889246463776, | |
| "rewards/correctness_reward_func": 0.484375, | |
| "rewards/strict_format_reward_func": 0.4375, | |
| "step": 93 | |
| }, | |
| { | |
| "completion_length": 330.67578125, | |
| "epoch": 0.10936591041303083, | |
| "grad_norm": 0.29946740274215317, | |
| "kl": 0.04864501953125, | |
| "learning_rate": 9.836474315195147e-07, | |
| "loss": 0.0019, | |
| "reward": 0.734375, | |
| "reward_std": 0.4280121922492981, | |
| "rewards/correctness_reward_func": 0.3125, | |
| "rewards/strict_format_reward_func": 0.421875, | |
| "step": 94 | |
| }, | |
| { | |
| "completion_length": 293.5234375, | |
| "epoch": 0.11052937754508435, | |
| "grad_norm": 0.3362629809809694, | |
| "kl": 0.045654296875, | |
| "learning_rate": 9.831656740980135e-07, | |
| "loss": 0.0018, | |
| "reward": 0.716796875, | |
| "reward_std": 0.4403715208172798, | |
| "rewards/correctness_reward_func": 0.296875, | |
| "rewards/strict_format_reward_func": 0.419921875, | |
| "step": 95 | |
| }, | |
| { | |
| "completion_length": 237.234375, | |
| "epoch": 0.11169284467713787, | |
| "grad_norm": 0.3082617927162893, | |
| "kl": 0.05224609375, | |
| "learning_rate": 9.826770443260193e-07, | |
| "loss": 0.0021, | |
| "reward": 0.818359375, | |
| "reward_std": 0.431897908449173, | |
| "rewards/correctness_reward_func": 0.3828125, | |
| "rewards/strict_format_reward_func": 0.435546875, | |
| "step": 96 | |
| }, | |
| { | |
| "completion_length": 272.08984375, | |
| "epoch": 0.1128563118091914, | |
| "grad_norm": 3.297879330447764, | |
| "kl": 0.06298828125, | |
| "learning_rate": 9.821815491536016e-07, | |
| "loss": 0.0025, | |
| "reward": 0.86328125, | |
| "reward_std": 0.4964246600866318, | |
| "rewards/correctness_reward_func": 0.4296875, | |
| "rewards/strict_format_reward_func": 0.43359375, | |
| "step": 97 | |
| }, | |
| { | |
| "completion_length": 313.859375, | |
| "epoch": 0.1140197789412449, | |
| "grad_norm": 1.7917369827084497, | |
| "kl": 0.05108642578125, | |
| "learning_rate": 9.81679195628481e-07, | |
| "loss": 0.002, | |
| "reward": 0.69140625, | |
| "reward_std": 0.4303680807352066, | |
| "rewards/correctness_reward_func": 0.265625, | |
| "rewards/strict_format_reward_func": 0.42578125, | |
| "step": 98 | |
| }, | |
| { | |
| "completion_length": 234.34375, | |
| "epoch": 0.11518324607329843, | |
| "grad_norm": 2.373852143479517, | |
| "kl": 0.07373046875, | |
| "learning_rate": 9.811699908959275e-07, | |
| "loss": 0.0029, | |
| "reward": 0.927734375, | |
| "reward_std": 0.4275398887693882, | |
| "rewards/correctness_reward_func": 0.4765625, | |
| "rewards/strict_format_reward_func": 0.451171875, | |
| "step": 99 | |
| }, | |
| { | |
| "completion_length": 267.76953125, | |
| "epoch": 0.11634671320535195, | |
| "grad_norm": 1.5428375901598974, | |
| "kl": 0.05035400390625, | |
| "learning_rate": 9.806539421986608e-07, | |
| "loss": 0.002, | |
| "reward": 0.763671875, | |
| "reward_std": 0.45994649082422256, | |
| "rewards/correctness_reward_func": 0.3203125, | |
| "rewards/strict_format_reward_func": 0.443359375, | |
| "step": 100 | |
| }, | |
| { | |
| "completion_length": 245.25390625, | |
| "epoch": 0.11751018033740547, | |
| "grad_norm": 2.3658036123831083, | |
| "kl": 0.04681396484375, | |
| "learning_rate": 9.80131056876746e-07, | |
| "loss": 0.0019, | |
| "reward": 0.8828125, | |
| "reward_std": 0.43228569626808167, | |
| "rewards/correctness_reward_func": 0.4375, | |
| "rewards/strict_format_reward_func": 0.4453125, | |
| "step": 101 | |
| }, | |
| { | |
| "completion_length": 229.6953125, | |
| "epoch": 0.11867364746945899, | |
| "grad_norm": 7.873285555480436, | |
| "kl": 0.05926513671875, | |
| "learning_rate": 9.796013423674898e-07, | |
| "loss": 0.0024, | |
| "reward": 0.75, | |
| "reward_std": 0.3967752233147621, | |
| "rewards/correctness_reward_func": 0.328125, | |
| "rewards/strict_format_reward_func": 0.421875, | |
| "step": 102 | |
| }, | |
| { | |
| "completion_length": 344.07421875, | |
| "epoch": 0.1198371146015125, | |
| "grad_norm": 15.56401233276747, | |
| "kl": 0.1546630859375, | |
| "learning_rate": 9.79064806205334e-07, | |
| "loss": 0.0062, | |
| "reward": 0.810546875, | |
| "reward_std": 0.44128578901290894, | |
| "rewards/correctness_reward_func": 0.3828125, | |
| "rewards/strict_format_reward_func": 0.427734375, | |
| "step": 103 | |
| }, | |
| { | |
| "completion_length": 282.15625, | |
| "epoch": 0.12100058173356602, | |
| "grad_norm": 5.111153394673512, | |
| "kl": 0.0589599609375, | |
| "learning_rate": 9.78521456021749e-07, | |
| "loss": 0.0024, | |
| "reward": 0.80078125, | |
| "reward_std": 0.48010821640491486, | |
| "rewards/correctness_reward_func": 0.3671875, | |
| "rewards/strict_format_reward_func": 0.43359375, | |
| "step": 104 | |
| }, | |
| { | |
| "completion_length": 242.20703125, | |
| "epoch": 0.12216404886561955, | |
| "grad_norm": 6.003504446851319, | |
| "kl": 0.05206298828125, | |
| "learning_rate": 9.779712995451252e-07, | |
| "loss": 0.0021, | |
| "reward": 0.80859375, | |
| "reward_std": 0.443262055516243, | |
| "rewards/correctness_reward_func": 0.375, | |
| "rewards/strict_format_reward_func": 0.43359375, | |
| "step": 105 | |
| }, | |
| { | |
| "completion_length": 276.546875, | |
| "epoch": 0.12332751599767307, | |
| "grad_norm": 4.073486002796206, | |
| "kl": 0.054443359375, | |
| "learning_rate": 9.77414344600663e-07, | |
| "loss": 0.0022, | |
| "reward": 0.767578125, | |
| "reward_std": 0.46208247542381287, | |
| "rewards/correctness_reward_func": 0.3359375, | |
| "rewards/strict_format_reward_func": 0.431640625, | |
| "step": 106 | |
| }, | |
| { | |
| "completion_length": 257.03515625, | |
| "epoch": 0.12449098312972659, | |
| "grad_norm": 18.5627783152072, | |
| "kl": 0.087646484375, | |
| "learning_rate": 9.76850599110261e-07, | |
| "loss": 0.0035, | |
| "reward": 0.794921875, | |
| "reward_std": 0.4035182222723961, | |
| "rewards/correctness_reward_func": 0.3671875, | |
| "rewards/strict_format_reward_func": 0.427734375, | |
| "step": 107 | |
| }, | |
| { | |
| "completion_length": 312.40625, | |
| "epoch": 0.1256544502617801, | |
| "grad_norm": 6.982791090576697, | |
| "kl": 0.115966796875, | |
| "learning_rate": 9.762800710924038e-07, | |
| "loss": 0.0046, | |
| "reward": 0.783203125, | |
| "reward_std": 0.5103202238678932, | |
| "rewards/correctness_reward_func": 0.375, | |
| "rewards/strict_format_reward_func": 0.408203125, | |
| "step": 108 | |
| }, | |
| { | |
| "completion_length": 266.125, | |
| "epoch": 0.12681791739383363, | |
| "grad_norm": 1.474075928860927, | |
| "kl": 0.04534912109375, | |
| "learning_rate": 9.75702768662048e-07, | |
| "loss": 0.0018, | |
| "reward": 0.828125, | |
| "reward_std": 0.398033931851387, | |
| "rewards/correctness_reward_func": 0.390625, | |
| "rewards/strict_format_reward_func": 0.4375, | |
| "step": 109 | |
| }, | |
| { | |
| "completion_length": 291.98828125, | |
| "epoch": 0.12798138452588714, | |
| "grad_norm": 1082.0566595703972, | |
| "kl": 2.355712890625, | |
| "learning_rate": 9.751187000305074e-07, | |
| "loss": 0.0937, | |
| "reward": 0.890625, | |
| "reward_std": 0.4832841530442238, | |
| "rewards/correctness_reward_func": 0.4609375, | |
| "rewards/strict_format_reward_func": 0.4296875, | |
| "step": 110 | |
| }, | |
| { | |
| "completion_length": 257.46484375, | |
| "epoch": 0.12914485165794065, | |
| "grad_norm": 8.260881771838365, | |
| "kl": 0.305908203125, | |
| "learning_rate": 9.745278735053343e-07, | |
| "loss": 0.0122, | |
| "reward": 0.791015625, | |
| "reward_std": 0.5271246433258057, | |
| "rewards/correctness_reward_func": 0.3671875, | |
| "rewards/strict_format_reward_func": 0.423828125, | |
| "step": 111 | |
| }, | |
| { | |
| "completion_length": 278.26171875, | |
| "epoch": 0.1303083187899942, | |
| "grad_norm": 7.127367987174519, | |
| "kl": 0.2362060546875, | |
| "learning_rate": 9.73930297490203e-07, | |
| "loss": 0.0094, | |
| "reward": 0.650390625, | |
| "reward_std": 0.3026326783001423, | |
| "rewards/correctness_reward_func": 0.2421875, | |
| "rewards/strict_format_reward_func": 0.408203125, | |
| "step": 112 | |
| }, | |
| { | |
| "completion_length": 264.4765625, | |
| "epoch": 0.1314717859220477, | |
| "grad_norm": 5.665522812893531, | |
| "kl": 0.190673828125, | |
| "learning_rate": 9.7332598048479e-07, | |
| "loss": 0.0076, | |
| "reward": 0.966796875, | |
| "reward_std": 0.4744589924812317, | |
| "rewards/correctness_reward_func": 0.5390625, | |
| "rewards/strict_format_reward_func": 0.427734375, | |
| "step": 113 | |
| }, | |
| { | |
| "completion_length": 313.296875, | |
| "epoch": 0.13263525305410123, | |
| "grad_norm": 0.6167995629967288, | |
| "kl": 0.0430908203125, | |
| "learning_rate": 9.727149310846523e-07, | |
| "loss": 0.0017, | |
| "reward": 0.662109375, | |
| "reward_std": 0.4723682776093483, | |
| "rewards/correctness_reward_func": 0.25, | |
| "rewards/strict_format_reward_func": 0.412109375, | |
| "step": 114 | |
| }, | |
| { | |
| "completion_length": 238.8828125, | |
| "epoch": 0.13379872018615474, | |
| "grad_norm": 5932.568937857461, | |
| "kl": 31.500244140625, | |
| "learning_rate": 9.720971579811065e-07, | |
| "loss": 1.2647, | |
| "reward": 0.939453125, | |
| "reward_std": 0.4201255813241005, | |
| "rewards/correctness_reward_func": 0.4921875, | |
| "rewards/strict_format_reward_func": 0.447265625, | |
| "step": 115 | |
| }, | |
| { | |
| "completion_length": 270.9375, | |
| "epoch": 0.13496218731820825, | |
| "grad_norm": 2.0818128486642893, | |
| "kl": 0.11199951171875, | |
| "learning_rate": 9.714726699611037e-07, | |
| "loss": 0.0045, | |
| "reward": 0.6328125, | |
| "reward_std": 0.34041667729616165, | |
| "rewards/correctness_reward_func": 0.234375, | |
| "rewards/strict_format_reward_func": 0.3984375, | |
| "step": 116 | |
| }, | |
| { | |
| "completion_length": 244.61328125, | |
| "epoch": 0.13612565445026178, | |
| "grad_norm": 29.72218794430255, | |
| "kl": 0.26605224609375, | |
| "learning_rate": 9.708414759071057e-07, | |
| "loss": 0.0106, | |
| "reward": 0.853515625, | |
| "reward_std": 0.4693729430437088, | |
| "rewards/correctness_reward_func": 0.4140625, | |
| "rewards/strict_format_reward_func": 0.439453125, | |
| "step": 117 | |
| }, | |
| { | |
| "completion_length": 269.69140625, | |
| "epoch": 0.1372891215823153, | |
| "grad_norm": 9.459551717392767, | |
| "kl": 0.16796875, | |
| "learning_rate": 9.702035847969578e-07, | |
| "loss": 0.0067, | |
| "reward": 0.802734375, | |
| "reward_std": 0.45168111473321915, | |
| "rewards/correctness_reward_func": 0.3671875, | |
| "rewards/strict_format_reward_func": 0.435546875, | |
| "step": 118 | |
| }, | |
| { | |
| "completion_length": 263.65625, | |
| "epoch": 0.13845258871436883, | |
| "grad_norm": 3.6379354568837274, | |
| "kl": 0.19122314453125, | |
| "learning_rate": 9.695590057037618e-07, | |
| "loss": 0.0077, | |
| "reward": 0.71875, | |
| "reward_std": 0.4168992340564728, | |
| "rewards/correctness_reward_func": 0.2890625, | |
| "rewards/strict_format_reward_func": 0.4296875, | |
| "step": 119 | |
| }, | |
| { | |
| "completion_length": 255.9921875, | |
| "epoch": 0.13961605584642234, | |
| "grad_norm": 6.956294376892292, | |
| "kl": 0.4310302734375, | |
| "learning_rate": 9.689077477957468e-07, | |
| "loss": 0.0172, | |
| "reward": 0.85546875, | |
| "reward_std": 0.5010552629828453, | |
| "rewards/correctness_reward_func": 0.4375, | |
| "rewards/strict_format_reward_func": 0.41796875, | |
| "step": 120 | |
| }, | |
| { | |
| "completion_length": 302.72265625, | |
| "epoch": 0.14077952297847585, | |
| "grad_norm": 3.7055581750620203, | |
| "kl": 0.0943603515625, | |
| "learning_rate": 9.682498203361378e-07, | |
| "loss": 0.0038, | |
| "reward": 0.6796875, | |
| "reward_std": 0.431684710085392, | |
| "rewards/correctness_reward_func": 0.2734375, | |
| "rewards/strict_format_reward_func": 0.40625, | |
| "step": 121 | |
| }, | |
| { | |
| "completion_length": 287.06640625, | |
| "epoch": 0.14194299011052938, | |
| "grad_norm": 3.6994429011170293, | |
| "kl": 0.097412109375, | |
| "learning_rate": 9.675852326830254e-07, | |
| "loss": 0.0039, | |
| "reward": 0.599609375, | |
| "reward_std": 0.2685260437428951, | |
| "rewards/correctness_reward_func": 0.1796875, | |
| "rewards/strict_format_reward_func": 0.419921875, | |
| "step": 122 | |
| }, | |
| { | |
| "completion_length": 244.97265625, | |
| "epoch": 0.1431064572425829, | |
| "grad_norm": 5.330519684929318, | |
| "kl": 0.474609375, | |
| "learning_rate": 9.669139942892323e-07, | |
| "loss": 0.019, | |
| "reward": 0.904296875, | |
| "reward_std": 0.45676978677511215, | |
| "rewards/correctness_reward_func": 0.4609375, | |
| "rewards/strict_format_reward_func": 0.443359375, | |
| "step": 123 | |
| }, | |
| { | |
| "completion_length": 244.953125, | |
| "epoch": 0.14426992437463643, | |
| "grad_norm": 7.439747574901674, | |
| "kl": 0.082763671875, | |
| "learning_rate": 9.66236114702178e-07, | |
| "loss": 0.0033, | |
| "reward": 0.830078125, | |
| "reward_std": 0.4560399353504181, | |
| "rewards/correctness_reward_func": 0.3828125, | |
| "rewards/strict_format_reward_func": 0.447265625, | |
| "step": 124 | |
| }, | |
| { | |
| "completion_length": 247.43359375, | |
| "epoch": 0.14543339150668994, | |
| "grad_norm": 8.444741434329583, | |
| "kl": 0.191650390625, | |
| "learning_rate": 9.655516035637436e-07, | |
| "loss": 0.0077, | |
| "reward": 0.7890625, | |
| "reward_std": 0.39387788623571396, | |
| "rewards/correctness_reward_func": 0.359375, | |
| "rewards/strict_format_reward_func": 0.4296875, | |
| "step": 125 | |
| }, | |
| { | |
| "completion_length": 286.15625, | |
| "epoch": 0.14659685863874344, | |
| "grad_norm": 1.7413985122485363, | |
| "kl": 0.1680908203125, | |
| "learning_rate": 9.648604706101354e-07, | |
| "loss": 0.0067, | |
| "reward": 0.775390625, | |
| "reward_std": 0.4273899048566818, | |
| "rewards/correctness_reward_func": 0.3515625, | |
| "rewards/strict_format_reward_func": 0.423828125, | |
| "step": 126 | |
| }, | |
| { | |
| "completion_length": 266.35546875, | |
| "epoch": 0.14776032577079698, | |
| "grad_norm": 23.48576289003154, | |
| "kl": 0.429443359375, | |
| "learning_rate": 9.641627256717452e-07, | |
| "loss": 0.0171, | |
| "reward": 0.904296875, | |
| "reward_std": 0.4860465005040169, | |
| "rewards/correctness_reward_func": 0.4609375, | |
| "rewards/strict_format_reward_func": 0.443359375, | |
| "step": 127 | |
| }, | |
| { | |
| "completion_length": 280.5, | |
| "epoch": 0.1489237929028505, | |
| "grad_norm": 3.434249282079891, | |
| "kl": 0.12042236328125, | |
| "learning_rate": 9.634583786730108e-07, | |
| "loss": 0.0048, | |
| "reward": 0.912109375, | |
| "reward_std": 0.4505278691649437, | |
| "rewards/correctness_reward_func": 0.4765625, | |
| "rewards/strict_format_reward_func": 0.435546875, | |
| "step": 128 | |
| }, | |
| { | |
| "completion_length": 252.86328125, | |
| "epoch": 0.15008726003490402, | |
| "grad_norm": 0.509688518372827, | |
| "kl": 0.05780029296875, | |
| "learning_rate": 9.627474396322753e-07, | |
| "loss": 0.0023, | |
| "reward": 0.833984375, | |
| "reward_std": 0.3295654430985451, | |
| "rewards/correctness_reward_func": 0.390625, | |
| "rewards/strict_format_reward_func": 0.443359375, | |
| "step": 129 | |
| }, | |
| { | |
| "completion_length": 244.61328125, | |
| "epoch": 0.15125072716695753, | |
| "grad_norm": 322.8549051555672, | |
| "kl": 0.65301513671875, | |
| "learning_rate": 9.62029918661644e-07, | |
| "loss": 0.0261, | |
| "reward": 0.849609375, | |
| "reward_std": 0.40418654680252075, | |
| "rewards/correctness_reward_func": 0.3984375, | |
| "rewards/strict_format_reward_func": 0.451171875, | |
| "step": 130 | |
| }, | |
| { | |
| "completion_length": 274.83203125, | |
| "epoch": 0.15241419429901104, | |
| "grad_norm": 2.5471313815643635, | |
| "kl": 0.08868408203125, | |
| "learning_rate": 9.613058259668414e-07, | |
| "loss": 0.0035, | |
| "reward": 0.732421875, | |
| "reward_std": 0.3115438222885132, | |
| "rewards/correctness_reward_func": 0.28125, | |
| "rewards/strict_format_reward_func": 0.451171875, | |
| "step": 131 | |
| }, | |
| { | |
| "completion_length": 263.140625, | |
| "epoch": 0.15357766143106458, | |
| "grad_norm": 0.4917581912639935, | |
| "kl": 0.05181884765625, | |
| "learning_rate": 9.60575171847065e-07, | |
| "loss": 0.0021, | |
| "reward": 0.8828125, | |
| "reward_std": 0.3844335228204727, | |
| "rewards/correctness_reward_func": 0.421875, | |
| "rewards/strict_format_reward_func": 0.4609375, | |
| "step": 132 | |
| }, | |
| { | |
| "completion_length": 240.77734375, | |
| "epoch": 0.15474112856311809, | |
| "grad_norm": 1.7256071643625732, | |
| "kl": 0.05010986328125, | |
| "learning_rate": 9.598379666948393e-07, | |
| "loss": 0.002, | |
| "reward": 0.923828125, | |
| "reward_std": 0.44481250643730164, | |
| "rewards/correctness_reward_func": 0.46875, | |
| "rewards/strict_format_reward_func": 0.455078125, | |
| "step": 133 | |
| }, | |
| { | |
| "completion_length": 262.2890625, | |
| "epoch": 0.15590459569517162, | |
| "grad_norm": 1.803011519361335, | |
| "kl": 0.06396484375, | |
| "learning_rate": 9.590942209958686e-07, | |
| "loss": 0.0026, | |
| "reward": 0.951171875, | |
| "reward_std": 0.5317578241229057, | |
| "rewards/correctness_reward_func": 0.5078125, | |
| "rewards/strict_format_reward_func": 0.443359375, | |
| "step": 134 | |
| }, | |
| { | |
| "completion_length": 264.09375, | |
| "epoch": 0.15706806282722513, | |
| "grad_norm": 2.320201832882074, | |
| "kl": 0.052978515625, | |
| "learning_rate": 9.583439453288864e-07, | |
| "loss": 0.0021, | |
| "reward": 0.7890625, | |
| "reward_std": 0.4296160414814949, | |
| "rewards/correctness_reward_func": 0.3515625, | |
| "rewards/strict_format_reward_func": 0.4375, | |
| "step": 135 | |
| }, | |
| { | |
| "completion_length": 265.18359375, | |
| "epoch": 0.15823152995927864, | |
| "grad_norm": 0.8648510710425397, | |
| "kl": 0.0433349609375, | |
| "learning_rate": 9.575871503655067e-07, | |
| "loss": 0.0017, | |
| "reward": 1.017578125, | |
| "reward_std": 0.5687045827507973, | |
| "rewards/correctness_reward_func": 0.5546875, | |
| "rewards/strict_format_reward_func": 0.462890625, | |
| "step": 136 | |
| }, | |
| { | |
| "completion_length": 313.328125, | |
| "epoch": 0.15939499709133217, | |
| "grad_norm": 7.228345785913302, | |
| "kl": 0.09490966796875, | |
| "learning_rate": 9.568238468700705e-07, | |
| "loss": 0.0038, | |
| "reward": 0.625, | |
| "reward_std": 0.32062922045588493, | |
| "rewards/correctness_reward_func": 0.1875, | |
| "rewards/strict_format_reward_func": 0.4375, | |
| "step": 137 | |
| }, | |
| { | |
| "completion_length": 272.78125, | |
| "epoch": 0.16055846422338568, | |
| "grad_norm": 4.791778485714245, | |
| "kl": 0.06451416015625, | |
| "learning_rate": 9.560540456994939e-07, | |
| "loss": 0.0026, | |
| "reward": 0.73828125, | |
| "reward_std": 0.3905741199851036, | |
| "rewards/correctness_reward_func": 0.2734375, | |
| "rewards/strict_format_reward_func": 0.46484375, | |
| "step": 138 | |
| }, | |
| { | |
| "completion_length": 284.91015625, | |
| "epoch": 0.16172193135543922, | |
| "grad_norm": 1.0070605023468475, | |
| "kl": 0.0789794921875, | |
| "learning_rate": 9.552777578031133e-07, | |
| "loss": 0.0032, | |
| "reward": 0.86328125, | |
| "reward_std": 0.45894617587327957, | |
| "rewards/correctness_reward_func": 0.4140625, | |
| "rewards/strict_format_reward_func": 0.44921875, | |
| "step": 139 | |
| }, | |
| { | |
| "completion_length": 254.89453125, | |
| "epoch": 0.16288539848749273, | |
| "grad_norm": 0.26164597278669394, | |
| "kl": 0.0531005859375, | |
| "learning_rate": 9.544949942225295e-07, | |
| "loss": 0.0021, | |
| "reward": 0.888671875, | |
| "reward_std": 0.4409971535205841, | |
| "rewards/correctness_reward_func": 0.4453125, | |
| "rewards/strict_format_reward_func": 0.443359375, | |
| "step": 140 | |
| }, | |
| { | |
| "completion_length": 297.02734375, | |
| "epoch": 0.16404886561954624, | |
| "grad_norm": 49.61137761643266, | |
| "kl": 0.19952392578125, | |
| "learning_rate": 9.537057660914508e-07, | |
| "loss": 0.008, | |
| "reward": 0.78515625, | |
| "reward_std": 0.2939911261200905, | |
| "rewards/correctness_reward_func": 0.3359375, | |
| "rewards/strict_format_reward_func": 0.44921875, | |
| "step": 141 | |
| }, | |
| { | |
| "completion_length": 228.41015625, | |
| "epoch": 0.16521233275159977, | |
| "grad_norm": 2.4563226616711726, | |
| "kl": 0.13043212890625, | |
| "learning_rate": 9.529100846355345e-07, | |
| "loss": 0.0052, | |
| "reward": 0.982421875, | |
| "reward_std": 0.36371277645230293, | |
| "rewards/correctness_reward_func": 0.515625, | |
| "rewards/strict_format_reward_func": 0.466796875, | |
| "step": 142 | |
| }, | |
| { | |
| "completion_length": 253.48828125, | |
| "epoch": 0.16637579988365328, | |
| "grad_norm": 1.7492293672269494, | |
| "kl": 0.06890869140625, | |
| "learning_rate": 9.521079611722276e-07, | |
| "loss": 0.0028, | |
| "reward": 0.892578125, | |
| "reward_std": 0.3547302335500717, | |
| "rewards/correctness_reward_func": 0.4296875, | |
| "rewards/strict_format_reward_func": 0.462890625, | |
| "step": 143 | |
| }, | |
| { | |
| "completion_length": 270.91015625, | |
| "epoch": 0.16753926701570682, | |
| "grad_norm": 186.25321871346733, | |
| "kl": 2.019775390625, | |
| "learning_rate": 9.512994071106054e-07, | |
| "loss": 0.0808, | |
| "reward": 0.876953125, | |
| "reward_std": 0.35532546043395996, | |
| "rewards/correctness_reward_func": 0.421875, | |
| "rewards/strict_format_reward_func": 0.455078125, | |
| "step": 144 | |
| }, | |
| { | |
| "completion_length": 251.24609375, | |
| "epoch": 0.16870273414776032, | |
| "grad_norm": 20.433967338967815, | |
| "kl": 0.61883544921875, | |
| "learning_rate": 9.504844339512094e-07, | |
| "loss": 0.0247, | |
| "reward": 1.021484375, | |
| "reward_std": 0.4558466151356697, | |
| "rewards/correctness_reward_func": 0.5546875, | |
| "rewards/strict_format_reward_func": 0.466796875, | |
| "step": 145 | |
| }, | |
| { | |
| "completion_length": 258.37109375, | |
| "epoch": 0.16986620127981383, | |
| "grad_norm": 1.2588068639439651, | |
| "kl": 0.07843017578125, | |
| "learning_rate": 9.49663053285884e-07, | |
| "loss": 0.0031, | |
| "reward": 0.8203125, | |
| "reward_std": 0.40197764337062836, | |
| "rewards/correctness_reward_func": 0.3515625, | |
| "rewards/strict_format_reward_func": 0.46875, | |
| "step": 146 | |
| }, | |
| { | |
| "completion_length": 249.65625, | |
| "epoch": 0.17102966841186737, | |
| "grad_norm": 26.616164942367558, | |
| "kl": 0.16827392578125, | |
| "learning_rate": 9.488352767976109e-07, | |
| "loss": 0.0067, | |
| "reward": 0.951171875, | |
| "reward_std": 0.43833911418914795, | |
| "rewards/correctness_reward_func": 0.484375, | |
| "rewards/strict_format_reward_func": 0.466796875, | |
| "step": 147 | |
| }, | |
| { | |
| "completion_length": 266.21484375, | |
| "epoch": 0.17219313554392088, | |
| "grad_norm": 16.56862163073421, | |
| "kl": 0.13897705078125, | |
| "learning_rate": 9.480011162603434e-07, | |
| "loss": 0.0056, | |
| "reward": 0.994140625, | |
| "reward_std": 0.41204124689102173, | |
| "rewards/correctness_reward_func": 0.5390625, | |
| "rewards/strict_format_reward_func": 0.455078125, | |
| "step": 148 | |
| }, | |
| { | |
| "completion_length": 305.5078125, | |
| "epoch": 0.1733566026759744, | |
| "grad_norm": 0.6862061857964759, | |
| "kl": 0.0863037109375, | |
| "learning_rate": 9.471605835388392e-07, | |
| "loss": 0.0035, | |
| "reward": 0.7421875, | |
| "reward_std": 0.3801525831222534, | |
| "rewards/correctness_reward_func": 0.28125, | |
| "rewards/strict_format_reward_func": 0.4609375, | |
| "step": 149 | |
| }, | |
| { | |
| "completion_length": 257.46484375, | |
| "epoch": 0.17452006980802792, | |
| "grad_norm": 7.269994609894895, | |
| "kl": 0.1292724609375, | |
| "learning_rate": 9.463136905884912e-07, | |
| "loss": 0.0052, | |
| "reward": 0.96484375, | |
| "reward_std": 0.4492557644844055, | |
| "rewards/correctness_reward_func": 0.515625, | |
| "rewards/strict_format_reward_func": 0.44921875, | |
| "step": 150 | |
| }, | |
| { | |
| "completion_length": 269.76171875, | |
| "epoch": 0.17568353694008143, | |
| "grad_norm": 3.546788791129262, | |
| "kl": 0.2747802734375, | |
| "learning_rate": 9.454604494551577e-07, | |
| "loss": 0.011, | |
| "reward": 0.794921875, | |
| "reward_std": 0.49609148502349854, | |
| "rewards/correctness_reward_func": 0.34375, | |
| "rewards/strict_format_reward_func": 0.451171875, | |
| "step": 151 | |
| }, | |
| { | |
| "completion_length": 241.5078125, | |
| "epoch": 0.17684700407213497, | |
| "grad_norm": 19.898589503994543, | |
| "kl": 0.2518310546875, | |
| "learning_rate": 9.446008722749905e-07, | |
| "loss": 0.0101, | |
| "reward": 0.912109375, | |
| "reward_std": 0.3499249704182148, | |
| "rewards/correctness_reward_func": 0.4453125, | |
| "rewards/strict_format_reward_func": 0.466796875, | |
| "step": 152 | |
| }, | |
| { | |
| "completion_length": 312.5625, | |
| "epoch": 0.17801047120418848, | |
| "grad_norm": 7.082944795431713, | |
| "kl": 0.22540283203125, | |
| "learning_rate": 9.437349712742634e-07, | |
| "loss": 0.009, | |
| "reward": 0.93359375, | |
| "reward_std": 0.38279156386852264, | |
| "rewards/correctness_reward_func": 0.4609375, | |
| "rewards/strict_format_reward_func": 0.47265625, | |
| "step": 153 | |
| }, | |
| { | |
| "completion_length": 290.33203125, | |
| "epoch": 0.179173938336242, | |
| "grad_norm": 2.204011656968964, | |
| "kl": 0.066650390625, | |
| "learning_rate": 9.428627587691971e-07, | |
| "loss": 0.0027, | |
| "reward": 0.783203125, | |
| "reward_std": 0.37652380019426346, | |
| "rewards/correctness_reward_func": 0.3359375, | |
| "rewards/strict_format_reward_func": 0.447265625, | |
| "step": 154 | |
| }, | |
| { | |
| "completion_length": 291.9921875, | |
| "epoch": 0.18033740546829552, | |
| "grad_norm": 28.05008573486938, | |
| "kl": 0.36279296875, | |
| "learning_rate": 9.419842471657846e-07, | |
| "loss": 0.0145, | |
| "reward": 0.794921875, | |
| "reward_std": 0.45617077499628067, | |
| "rewards/correctness_reward_func": 0.3671875, | |
| "rewards/strict_format_reward_func": 0.427734375, | |
| "step": 155 | |
| }, | |
| { | |
| "completion_length": 286.06640625, | |
| "epoch": 0.18150087260034903, | |
| "grad_norm": 43.77812238882512, | |
| "kl": 2.6201171875, | |
| "learning_rate": 9.410994489596153e-07, | |
| "loss": 0.105, | |
| "reward": 0.904296875, | |
| "reward_std": 0.4255755990743637, | |
| "rewards/correctness_reward_func": 0.453125, | |
| "rewards/strict_format_reward_func": 0.451171875, | |
| "step": 156 | |
| }, | |
| { | |
| "completion_length": 241.26953125, | |
| "epoch": 0.18266433973240256, | |
| "grad_norm": 10.84079560138156, | |
| "kl": 1.619384765625, | |
| "learning_rate": 9.402083767356957e-07, | |
| "loss": 0.0646, | |
| "reward": 0.904296875, | |
| "reward_std": 0.3274926654994488, | |
| "rewards/correctness_reward_func": 0.4453125, | |
| "rewards/strict_format_reward_func": 0.458984375, | |
| "step": 157 | |
| }, | |
| { | |
| "completion_length": 248.51171875, | |
| "epoch": 0.18382780686445607, | |
| "grad_norm": 65.24252743780556, | |
| "kl": 2.162109375, | |
| "learning_rate": 9.393110431682721e-07, | |
| "loss": 0.0867, | |
| "reward": 0.85546875, | |
| "reward_std": 0.3152204602956772, | |
| "rewards/correctness_reward_func": 0.390625, | |
| "rewards/strict_format_reward_func": 0.46484375, | |
| "step": 158 | |
| }, | |
| { | |
| "completion_length": 283.03125, | |
| "epoch": 0.1849912739965096, | |
| "grad_norm": 47.62423123432252, | |
| "kl": 0.57366943359375, | |
| "learning_rate": 9.384074610206493e-07, | |
| "loss": 0.023, | |
| "reward": 0.95703125, | |
| "reward_std": 0.458492249250412, | |
| "rewards/correctness_reward_func": 0.5, | |
| "rewards/strict_format_reward_func": 0.45703125, | |
| "step": 159 | |
| }, | |
| { | |
| "completion_length": 286.9921875, | |
| "epoch": 0.18615474112856312, | |
| "grad_norm": 18.91448421977711, | |
| "kl": 1.2669677734375, | |
| "learning_rate": 9.374976431450094e-07, | |
| "loss": 0.0508, | |
| "reward": 0.943359375, | |
| "reward_std": 0.33836888894438744, | |
| "rewards/correctness_reward_func": 0.484375, | |
| "rewards/strict_format_reward_func": 0.458984375, | |
| "step": 160 | |
| }, | |
| { | |
| "completion_length": 279.0, | |
| "epoch": 0.18731820826061663, | |
| "grad_norm": 706.4286890306558, | |
| "kl": 2.40673828125, | |
| "learning_rate": 9.365816024822288e-07, | |
| "loss": 0.0961, | |
| "reward": 0.9609375, | |
| "reward_std": 0.39441975951194763, | |
| "rewards/correctness_reward_func": 0.5078125, | |
| "rewards/strict_format_reward_func": 0.453125, | |
| "step": 161 | |
| }, | |
| { | |
| "completion_length": 292.0234375, | |
| "epoch": 0.18848167539267016, | |
| "grad_norm": 17.463248908977885, | |
| "kl": 0.15557861328125, | |
| "learning_rate": 9.356593520616946e-07, | |
| "loss": 0.0062, | |
| "reward": 0.943359375, | |
| "reward_std": 0.3664560765028, | |
| "rewards/correctness_reward_func": 0.4921875, | |
| "rewards/strict_format_reward_func": 0.451171875, | |
| "step": 162 | |
| }, | |
| { | |
| "completion_length": 331.6328125, | |
| "epoch": 0.18964514252472367, | |
| "grad_norm": 2.249132481438475, | |
| "kl": 0.0911865234375, | |
| "learning_rate": 9.347309050011186e-07, | |
| "loss": 0.0036, | |
| "reward": 0.765625, | |
| "reward_std": 0.37423864006996155, | |
| "rewards/correctness_reward_func": 0.3125, | |
| "rewards/strict_format_reward_func": 0.453125, | |
| "step": 163 | |
| }, | |
| { | |
| "completion_length": 297.85546875, | |
| "epoch": 0.1908086096567772, | |
| "grad_norm": 0.5787880880323766, | |
| "kl": 0.0926513671875, | |
| "learning_rate": 9.337962745063512e-07, | |
| "loss": 0.0037, | |
| "reward": 0.810546875, | |
| "reward_std": 0.49850574135780334, | |
| "rewards/correctness_reward_func": 0.359375, | |
| "rewards/strict_format_reward_func": 0.451171875, | |
| "step": 164 | |
| }, | |
| { | |
| "completion_length": 265.62109375, | |
| "epoch": 0.19197207678883071, | |
| "grad_norm": 2.6837086070654985, | |
| "kl": 0.05865478515625, | |
| "learning_rate": 9.328554738711935e-07, | |
| "loss": 0.0023, | |
| "reward": 0.921875, | |
| "reward_std": 0.513519324362278, | |
| "rewards/correctness_reward_func": 0.4609375, | |
| "rewards/strict_format_reward_func": 0.4609375, | |
| "step": 165 | |
| }, | |
| { | |
| "completion_length": 246.55859375, | |
| "epoch": 0.19313554392088422, | |
| "grad_norm": 3.4134385495340216, | |
| "kl": 0.2457275390625, | |
| "learning_rate": 9.31908516477208e-07, | |
| "loss": 0.0098, | |
| "reward": 0.89453125, | |
| "reward_std": 0.4189570024609566, | |
| "rewards/correctness_reward_func": 0.4375, | |
| "rewards/strict_format_reward_func": 0.45703125, | |
| "step": 166 | |
| }, | |
| { | |
| "completion_length": 246.66015625, | |
| "epoch": 0.19429901105293776, | |
| "grad_norm": 0.9230642941742443, | |
| "kl": 0.11737060546875, | |
| "learning_rate": 9.309554157935286e-07, | |
| "loss": 0.0047, | |
| "reward": 0.96875, | |
| "reward_std": 0.4920966625213623, | |
| "rewards/correctness_reward_func": 0.5078125, | |
| "rewards/strict_format_reward_func": 0.4609375, | |
| "step": 167 | |
| }, | |
| { | |
| "completion_length": 225.83984375, | |
| "epoch": 0.19546247818499127, | |
| "grad_norm": 2.8669767245130577, | |
| "kl": 0.05242919921875, | |
| "learning_rate": 9.299961853766689e-07, | |
| "loss": 0.0021, | |
| "reward": 0.978515625, | |
| "reward_std": 0.443370521068573, | |
| "rewards/correctness_reward_func": 0.5234375, | |
| "rewards/strict_format_reward_func": 0.455078125, | |
| "step": 168 | |
| }, | |
| { | |
| "completion_length": 273.70703125, | |
| "epoch": 0.1966259453170448, | |
| "grad_norm": 0.9798307739601261, | |
| "kl": 0.08355712890625, | |
| "learning_rate": 9.290308388703288e-07, | |
| "loss": 0.0033, | |
| "reward": 0.791015625, | |
| "reward_std": 0.32959311455488205, | |
| "rewards/correctness_reward_func": 0.328125, | |
| "rewards/strict_format_reward_func": 0.462890625, | |
| "step": 169 | |
| }, | |
| { | |
| "completion_length": 306.48828125, | |
| "epoch": 0.1977894124490983, | |
| "grad_norm": 16.018378185855333, | |
| "kl": 0.63409423828125, | |
| "learning_rate": 9.280593900052014e-07, | |
| "loss": 0.0254, | |
| "reward": 0.822265625, | |
| "reward_std": 0.4224512651562691, | |
| "rewards/correctness_reward_func": 0.375, | |
| "rewards/strict_format_reward_func": 0.447265625, | |
| "step": 170 | |
| }, | |
| { | |
| "completion_length": 307.9375, | |
| "epoch": 0.19895287958115182, | |
| "grad_norm": 8.264210864082758, | |
| "kl": 0.29107666015625, | |
| "learning_rate": 9.270818525987771e-07, | |
| "loss": 0.0117, | |
| "reward": 0.87890625, | |
| "reward_std": 0.4745420068502426, | |
| "rewards/correctness_reward_func": 0.4453125, | |
| "rewards/strict_format_reward_func": 0.43359375, | |
| "step": 171 | |
| }, | |
| { | |
| "completion_length": 273.25, | |
| "epoch": 0.20011634671320536, | |
| "grad_norm": 3200.4657340851295, | |
| "kl": 2.255615234375, | |
| "learning_rate": 9.260982405551476e-07, | |
| "loss": 0.0899, | |
| "reward": 0.892578125, | |
| "reward_std": 0.49684761464595795, | |
| "rewards/correctness_reward_func": 0.4375, | |
| "rewards/strict_format_reward_func": 0.455078125, | |
| "step": 172 | |
| }, | |
| { | |
| "completion_length": 301.10546875, | |
| "epoch": 0.20127981384525886, | |
| "grad_norm": 0.4106069684572662, | |
| "kl": 0.06402587890625, | |
| "learning_rate": 9.251085678648071e-07, | |
| "loss": 0.0026, | |
| "reward": 0.775390625, | |
| "reward_std": 0.3673408329486847, | |
| "rewards/correctness_reward_func": 0.3359375, | |
| "rewards/strict_format_reward_func": 0.439453125, | |
| "step": 173 | |
| }, | |
| { | |
| "completion_length": 306.25, | |
| "epoch": 0.2024432809773124, | |
| "grad_norm": 2.4526371721107645, | |
| "kl": 0.0994873046875, | |
| "learning_rate": 9.241128486044542e-07, | |
| "loss": 0.004, | |
| "reward": 0.94140625, | |
| "reward_std": 0.5130884796380997, | |
| "rewards/correctness_reward_func": 0.4921875, | |
| "rewards/strict_format_reward_func": 0.44921875, | |
| "step": 174 | |
| }, | |
| { | |
| "completion_length": 294.6015625, | |
| "epoch": 0.2036067481093659, | |
| "grad_norm": 7.724202386944696, | |
| "kl": 0.11065673828125, | |
| "learning_rate": 9.231110969367918e-07, | |
| "loss": 0.0044, | |
| "reward": 0.8359375, | |
| "reward_std": 0.4581919386982918, | |
| "rewards/correctness_reward_func": 0.3984375, | |
| "rewards/strict_format_reward_func": 0.4375, | |
| "step": 175 | |
| }, | |
| { | |
| "completion_length": 254.26953125, | |
| "epoch": 0.20477021524141942, | |
| "grad_norm": 4.0122059528505085, | |
| "kl": 0.31842041015625, | |
| "learning_rate": 9.221033271103249e-07, | |
| "loss": 0.0127, | |
| "reward": 0.92578125, | |
| "reward_std": 0.4053529165685177, | |
| "rewards/correctness_reward_func": 0.4609375, | |
| "rewards/strict_format_reward_func": 0.46484375, | |
| "step": 176 | |
| }, | |
| { | |
| "completion_length": 309.66796875, | |
| "epoch": 0.20593368237347295, | |
| "grad_norm": 2.06130360097307, | |
| "kl": 0.1171875, | |
| "learning_rate": 9.210895534591582e-07, | |
| "loss": 0.0047, | |
| "reward": 0.8984375, | |
| "reward_std": 0.4733218848705292, | |
| "rewards/correctness_reward_func": 0.4453125, | |
| "rewards/strict_format_reward_func": 0.453125, | |
| "step": 177 | |
| }, | |
| { | |
| "completion_length": 323.63671875, | |
| "epoch": 0.20709714950552646, | |
| "grad_norm": 5.004188106979899, | |
| "kl": 0.0877685546875, | |
| "learning_rate": 9.200697904027927e-07, | |
| "loss": 0.0035, | |
| "reward": 0.783203125, | |
| "reward_std": 0.4214430972933769, | |
| "rewards/correctness_reward_func": 0.34375, | |
| "rewards/strict_format_reward_func": 0.439453125, | |
| "step": 178 | |
| }, | |
| { | |
| "completion_length": 305.921875, | |
| "epoch": 0.20826061663758, | |
| "grad_norm": 1.6668849296659807, | |
| "kl": 0.0859375, | |
| "learning_rate": 9.190440524459202e-07, | |
| "loss": 0.0034, | |
| "reward": 0.916015625, | |
| "reward_std": 0.48984475433826447, | |
| "rewards/correctness_reward_func": 0.46875, | |
| "rewards/strict_format_reward_func": 0.447265625, | |
| "step": 179 | |
| }, | |
| { | |
| "completion_length": 246.859375, | |
| "epoch": 0.2094240837696335, | |
| "grad_norm": 3.1773447652134212, | |
| "kl": 0.0968017578125, | |
| "learning_rate": 9.18012354178217e-07, | |
| "loss": 0.0039, | |
| "reward": 1.044921875, | |
| "reward_std": 0.49948541074991226, | |
| "rewards/correctness_reward_func": 0.5859375, | |
| "rewards/strict_format_reward_func": 0.458984375, | |
| "step": 180 | |
| }, | |
| { | |
| "completion_length": 270.046875, | |
| "epoch": 0.21058755090168702, | |
| "grad_norm": 12.240600235073, | |
| "kl": 0.146240234375, | |
| "learning_rate": 9.16974710274136e-07, | |
| "loss": 0.0059, | |
| "reward": 0.888671875, | |
| "reward_std": 0.4256081059575081, | |
| "rewards/correctness_reward_func": 0.4375, | |
| "rewards/strict_format_reward_func": 0.451171875, | |
| "step": 181 | |
| }, | |
| { | |
| "completion_length": 277.4921875, | |
| "epoch": 0.21175101803374055, | |
| "grad_norm": 41.948078009277715, | |
| "kl": 0.93603515625, | |
| "learning_rate": 9.159311354926989e-07, | |
| "loss": 0.0376, | |
| "reward": 0.994140625, | |
| "reward_std": 0.42895379662513733, | |
| "rewards/correctness_reward_func": 0.53125, | |
| "rewards/strict_format_reward_func": 0.462890625, | |
| "step": 182 | |
| }, | |
| { | |
| "completion_length": 270.48046875, | |
| "epoch": 0.21291448516579406, | |
| "grad_norm": 1.2973502608915635, | |
| "kl": 0.06439208984375, | |
| "learning_rate": 9.148816446772858e-07, | |
| "loss": 0.0026, | |
| "reward": 0.755859375, | |
| "reward_std": 0.29502545297145844, | |
| "rewards/correctness_reward_func": 0.3046875, | |
| "rewards/strict_format_reward_func": 0.451171875, | |
| "step": 183 | |
| }, | |
| { | |
| "completion_length": 297.19921875, | |
| "epoch": 0.2140779522978476, | |
| "grad_norm": 2.500567548574192, | |
| "kl": 0.09307861328125, | |
| "learning_rate": 9.138262527554237e-07, | |
| "loss": 0.0037, | |
| "reward": 0.91796875, | |
| "reward_std": 0.4057604745030403, | |
| "rewards/correctness_reward_func": 0.4609375, | |
| "rewards/strict_format_reward_func": 0.45703125, | |
| "step": 184 | |
| }, | |
| { | |
| "completion_length": 226.44140625, | |
| "epoch": 0.2152414194299011, | |
| "grad_norm": 14.743103963897818, | |
| "kl": 0.131103515625, | |
| "learning_rate": 9.127649747385748e-07, | |
| "loss": 0.0052, | |
| "reward": 1.005859375, | |
| "reward_std": 0.48278648406267166, | |
| "rewards/correctness_reward_func": 0.5390625, | |
| "rewards/strict_format_reward_func": 0.466796875, | |
| "step": 185 | |
| }, | |
| { | |
| "completion_length": 295.98046875, | |
| "epoch": 0.2164048865619546, | |
| "grad_norm": 2.599485686164368, | |
| "kl": 0.10174560546875, | |
| "learning_rate": 9.116978257219223e-07, | |
| "loss": 0.0041, | |
| "reward": 0.900390625, | |
| "reward_std": 0.4165002331137657, | |
| "rewards/correctness_reward_func": 0.4453125, | |
| "rewards/strict_format_reward_func": 0.455078125, | |
| "step": 186 | |
| }, | |
| { | |
| "completion_length": 273.453125, | |
| "epoch": 0.21756835369400815, | |
| "grad_norm": 1.9090617402205379, | |
| "kl": 0.08551025390625, | |
| "learning_rate": 9.106248208841568e-07, | |
| "loss": 0.0034, | |
| "reward": 0.875, | |
| "reward_std": 0.3356376253068447, | |
| "rewards/correctness_reward_func": 0.40625, | |
| "rewards/strict_format_reward_func": 0.46875, | |
| "step": 187 | |
| }, | |
| { | |
| "completion_length": 262.65625, | |
| "epoch": 0.21873182082606166, | |
| "grad_norm": 144.5858579696907, | |
| "kl": 1.45574951171875, | |
| "learning_rate": 9.095459754872588e-07, | |
| "loss": 0.0584, | |
| "reward": 0.818359375, | |
| "reward_std": 0.437059473246336, | |
| "rewards/correctness_reward_func": 0.3515625, | |
| "rewards/strict_format_reward_func": 0.466796875, | |
| "step": 188 | |
| }, | |
| { | |
| "completion_length": 292.56640625, | |
| "epoch": 0.2198952879581152, | |
| "grad_norm": 1.0988973248113927, | |
| "kl": 0.07281494140625, | |
| "learning_rate": 9.084613048762833e-07, | |
| "loss": 0.0029, | |
| "reward": 0.90625, | |
| "reward_std": 0.5009823776781559, | |
| "rewards/correctness_reward_func": 0.4609375, | |
| "rewards/strict_format_reward_func": 0.4453125, | |
| "step": 189 | |
| }, | |
| { | |
| "completion_length": 297.6640625, | |
| "epoch": 0.2210587550901687, | |
| "grad_norm": 0.7765766278689193, | |
| "kl": 0.0587158203125, | |
| "learning_rate": 9.073708244791405e-07, | |
| "loss": 0.0023, | |
| "reward": 0.9609375, | |
| "reward_std": 0.39268171787261963, | |
| "rewards/correctness_reward_func": 0.4921875, | |
| "rewards/strict_format_reward_func": 0.46875, | |
| "step": 190 | |
| }, | |
| { | |
| "completion_length": 259.953125, | |
| "epoch": 0.2222222222222222, | |
| "grad_norm": 0.7753362260289501, | |
| "kl": 0.05169677734375, | |
| "learning_rate": 9.062745498063764e-07, | |
| "loss": 0.0021, | |
| "reward": 0.775390625, | |
| "reward_std": 0.2924235537648201, | |
| "rewards/correctness_reward_func": 0.3125, | |
| "rewards/strict_format_reward_func": 0.462890625, | |
| "step": 191 | |
| }, | |
| { | |
| "completion_length": 347.28515625, | |
| "epoch": 0.22338568935427575, | |
| "grad_norm": 1.6977885523114584, | |
| "kl": 0.0623779296875, | |
| "learning_rate": 9.051724964509526e-07, | |
| "loss": 0.0025, | |
| "reward": 0.88671875, | |
| "reward_std": 0.3994247317314148, | |
| "rewards/correctness_reward_func": 0.4296875, | |
| "rewards/strict_format_reward_func": 0.45703125, | |
| "step": 192 | |
| }, | |
| { | |
| "completion_length": 265.41015625, | |
| "epoch": 0.22454915648632925, | |
| "grad_norm": 18.19050532587102, | |
| "kl": 0.7923583984375, | |
| "learning_rate": 9.040646800880242e-07, | |
| "loss": 0.0317, | |
| "reward": 0.892578125, | |
| "reward_std": 0.5228888541460037, | |
| "rewards/correctness_reward_func": 0.453125, | |
| "rewards/strict_format_reward_func": 0.439453125, | |
| "step": 193 | |
| }, | |
| { | |
| "completion_length": 312.88671875, | |
| "epoch": 0.2257126236183828, | |
| "grad_norm": 0.8340243461295392, | |
| "kl": 0.0703125, | |
| "learning_rate": 9.029511164747175e-07, | |
| "loss": 0.0028, | |
| "reward": 0.83984375, | |
| "reward_std": 0.3894122242927551, | |
| "rewards/correctness_reward_func": 0.40625, | |
| "rewards/strict_format_reward_func": 0.43359375, | |
| "step": 194 | |
| }, | |
| { | |
| "completion_length": 286.765625, | |
| "epoch": 0.2268760907504363, | |
| "grad_norm": 1.200161776433764, | |
| "kl": 0.05950927734375, | |
| "learning_rate": 9.018318214499041e-07, | |
| "loss": 0.0024, | |
| "reward": 0.8203125, | |
| "reward_std": 0.3945379853248596, | |
| "rewards/correctness_reward_func": 0.3671875, | |
| "rewards/strict_format_reward_func": 0.453125, | |
| "step": 195 | |
| }, | |
| { | |
| "completion_length": 247.5234375, | |
| "epoch": 0.2280395578824898, | |
| "grad_norm": 0.3212206681274319, | |
| "kl": 0.06475830078125, | |
| "learning_rate": 9.007068109339783e-07, | |
| "loss": 0.0026, | |
| "reward": 0.94921875, | |
| "reward_std": 0.41352738067507744, | |
| "rewards/correctness_reward_func": 0.5, | |
| "rewards/strict_format_reward_func": 0.44921875, | |
| "step": 196 | |
| }, | |
| { | |
| "completion_length": 242.36328125, | |
| "epoch": 0.22920302501454334, | |
| "grad_norm": 98.99732039672644, | |
| "kl": 0.5511474609375, | |
| "learning_rate": 8.995761009286282e-07, | |
| "loss": 0.0221, | |
| "reward": 0.99609375, | |
| "reward_std": 0.35693345218896866, | |
| "rewards/correctness_reward_func": 0.5546875, | |
| "rewards/strict_format_reward_func": 0.44140625, | |
| "step": 197 | |
| }, | |
| { | |
| "completion_length": 251.578125, | |
| "epoch": 0.23036649214659685, | |
| "grad_norm": 1.7688888702316456, | |
| "kl": 0.063720703125, | |
| "learning_rate": 8.984397075166095e-07, | |
| "loss": 0.0026, | |
| "reward": 0.904296875, | |
| "reward_std": 0.500565767288208, | |
| "rewards/correctness_reward_func": 0.4453125, | |
| "rewards/strict_format_reward_func": 0.458984375, | |
| "step": 198 | |
| }, | |
| { | |
| "completion_length": 289.9921875, | |
| "epoch": 0.2315299592786504, | |
| "grad_norm": 1.0097855862170513, | |
| "kl": 0.0572509765625, | |
| "learning_rate": 8.97297646861516e-07, | |
| "loss": 0.0023, | |
| "reward": 0.830078125, | |
| "reward_std": 0.39669080078601837, | |
| "rewards/correctness_reward_func": 0.3828125, | |
| "rewards/strict_format_reward_func": 0.447265625, | |
| "step": 199 | |
| }, | |
| { | |
| "completion_length": 254.05859375, | |
| "epoch": 0.2326934264107039, | |
| "grad_norm": 1.2860517043327864, | |
| "kl": 0.0604248046875, | |
| "learning_rate": 8.96149935207551e-07, | |
| "loss": 0.0024, | |
| "reward": 0.96875, | |
| "reward_std": 0.3961385078728199, | |
| "rewards/correctness_reward_func": 0.5078125, | |
| "rewards/strict_format_reward_func": 0.4609375, | |
| "step": 200 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 859, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 100, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 4, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |