{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 6.0,
  "eval_steps": 500,
  "global_step": 498,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "completion_length": 82.65625,
      "epoch": 0.012048192771084338,
      "grad_norm": 5.841508927710052,
      "kl": 0.0,
      "learning_rate": 9.97991967871486e-07,
      "loss": 0.0,
      "reward": 1.4489864706993103,
      "reward_std": 0.8421240150928497,
      "rewards/accuracy_reward": 0.8005490005016327,
      "rewards/format_reward": 0.6484375,
      "step": 1
    },
    {
      "completion_length": 91.453125,
      "epoch": 0.024096385542168676,
      "grad_norm": 4.392637703815363,
      "kl": 0.00279998779296875,
      "learning_rate": 9.959839357429717e-07,
      "loss": 0.0001,
      "reward": 1.3076424598693848,
      "reward_std": 0.8380775451660156,
      "rewards/accuracy_reward": 0.6123300492763519,
      "rewards/format_reward": 0.6953125,
      "step": 2
    },
    {
      "completion_length": 79.171875,
      "epoch": 0.03614457831325301,
      "grad_norm": 5.134937236220538,
      "kl": 0.009063720703125,
      "learning_rate": 9.93975903614458e-07,
      "loss": 0.0004,
      "reward": 1.650797963142395,
      "reward_std": 0.8256142735481262,
      "rewards/accuracy_reward": 0.8773605227470398,
      "rewards/format_reward": 0.7734375,
      "step": 3
    },
    {
      "completion_length": 90.8671875,
      "epoch": 0.04819277108433735,
      "grad_norm": 4.181043208735878,
      "kl": 0.0099029541015625,
      "learning_rate": 9.919678714859437e-07,
      "loss": 0.0004,
      "reward": 1.4978268146514893,
      "reward_std": 0.7668428122997284,
      "rewards/accuracy_reward": 0.6618892848491669,
      "rewards/format_reward": 0.8359375,
      "step": 4
    },
    {
      "completion_length": 83.15625,
      "epoch": 0.060240963855421686,
      "grad_norm": 4.623169300333461,
      "kl": 0.028106689453125,
      "learning_rate": 9.899598393574296e-07,
      "loss": 0.0011,
      "reward": 1.959537386894226,
      "reward_std": 0.6147363781929016,
      "rewards/accuracy_reward": 1.0532873272895813,
      "rewards/format_reward": 0.90625,
      "step": 5
    },
    {
      "completion_length": 75.1484375,
      "epoch": 0.07228915662650602,
      "grad_norm": 5.568012410409197,
      "kl": 0.03021240234375,
      "learning_rate": 9.879518072289156e-07,
      "loss": 0.0012,
      "reward": 2.047786593437195,
      "reward_std": 0.4053535610437393,
      "rewards/accuracy_reward": 1.0946615934371948,
      "rewards/format_reward": 0.953125,
      "step": 6
    },
    {
      "completion_length": 76.03125,
      "epoch": 0.08433734939759036,
      "grad_norm": 4.7579852016782045,
      "kl": 0.033935546875,
      "learning_rate": 9.859437751004016e-07,
      "loss": 0.0014,
      "reward": 2.1630080938339233,
      "reward_std": 0.3877447098493576,
      "rewards/accuracy_reward": 1.2333204746246338,
      "rewards/format_reward": 0.9296875,
      "step": 7
    },
    {
      "completion_length": 71.546875,
      "epoch": 0.0963855421686747,
      "grad_norm": 9.256093312505593,
      "kl": 0.244384765625,
      "learning_rate": 9.839357429718876e-07,
      "loss": 0.0097,
      "reward": 2.015242576599121,
      "reward_std": 0.4337102472782135,
      "rewards/accuracy_reward": 1.054305076599121,
      "rewards/format_reward": 0.9609375,
      "step": 8
    },
    {
      "completion_length": 72.1796875,
      "epoch": 0.10843373493975904,
      "grad_norm": 9.959610046323814,
      "kl": 0.2841796875,
      "learning_rate": 9.819277108433734e-07,
      "loss": 0.0114,
      "reward": 1.9989103078842163,
      "reward_std": 0.38074547052383423,
      "rewards/accuracy_reward": 1.0145351886749268,
      "rewards/format_reward": 0.984375,
      "step": 9
    },
    {
      "completion_length": 67.0078125,
      "epoch": 0.12048192771084337,
      "grad_norm": 4.494217301954794,
      "kl": 0.0677490234375,
      "learning_rate": 9.799196787148593e-07,
      "loss": 0.0027,
      "reward": 2.208647847175598,
      "reward_std": 0.20472895354032516,
      "rewards/accuracy_reward": 1.2086476683616638,
      "rewards/format_reward": 1.0,
      "step": 10
    },
    {
      "completion_length": 66.3125,
      "epoch": 0.13253012048192772,
      "grad_norm": 4.205085729740715,
      "kl": 0.111083984375,
      "learning_rate": 9.779116465863453e-07,
      "loss": 0.0044,
      "reward": 2.016738772392273,
      "reward_std": 0.39626075327396393,
      "rewards/accuracy_reward": 1.0323637425899506,
      "rewards/format_reward": 0.984375,
      "step": 11
    },
    {
      "completion_length": 64.2265625,
      "epoch": 0.14457831325301204,
      "grad_norm": 5.285643902891126,
      "kl": 0.0670166015625,
      "learning_rate": 9.759036144578313e-07,
      "loss": 0.0027,
      "reward": 2.0809445977211,
      "reward_std": 0.3285638391971588,
      "rewards/accuracy_reward": 1.080944538116455,
      "rewards/format_reward": 1.0,
      "step": 12
    },
    {
      "completion_length": 57.7265625,
      "epoch": 0.1566265060240964,
      "grad_norm": 5.332797970620105,
      "kl": 0.07958984375,
      "learning_rate": 9.738955823293173e-07,
      "loss": 0.0032,
      "reward": 2.1677627563476562,
      "reward_std": 0.32235731184482574,
      "rewards/accuracy_reward": 1.1677626371383667,
      "rewards/format_reward": 1.0,
      "step": 13
    },
    {
      "completion_length": 62.765625,
      "epoch": 0.1686746987951807,
      "grad_norm": 7.594424067233083,
      "kl": 0.086181640625,
      "learning_rate": 9.718875502008033e-07,
      "loss": 0.0034,
      "reward": 2.287484049797058,
      "reward_std": 0.2577601447701454,
      "rewards/accuracy_reward": 1.3031091094017029,
      "rewards/format_reward": 0.984375,
      "step": 14
    },
    {
      "completion_length": 61.28125,
      "epoch": 0.18072289156626506,
      "grad_norm": 6.602361615736723,
      "kl": 0.087890625,
      "learning_rate": 9.69879518072289e-07,
      "loss": 0.0035,
      "reward": 2.28032910823822,
      "reward_std": 0.38463760912418365,
      "rewards/accuracy_reward": 1.2881416082382202,
      "rewards/format_reward": 0.9921875,
      "step": 15
    },
    {
      "completion_length": 63.6796875,
      "epoch": 0.1927710843373494,
      "grad_norm": 4.1986480450121135,
      "kl": 0.078125,
      "learning_rate": 9.67871485943775e-07,
      "loss": 0.0031,
      "reward": 2.1277613639831543,
      "reward_std": 0.2963729351758957,
      "rewards/accuracy_reward": 1.1433865427970886,
      "rewards/format_reward": 0.984375,
      "step": 16
    },
    {
      "completion_length": 60.65625,
      "epoch": 0.20481927710843373,
      "grad_norm": 6.921299965436032,
      "kl": 0.088134765625,
      "learning_rate": 9.65863453815261e-07,
      "loss": 0.0035,
      "reward": 2.157727599143982,
      "reward_std": 0.30868735909461975,
      "rewards/accuracy_reward": 1.1733525395393372,
      "rewards/format_reward": 0.984375,
      "step": 17
    },
    {
      "completion_length": 59.2265625,
      "epoch": 0.21686746987951808,
      "grad_norm": 4.904213548043611,
      "kl": 0.07666015625,
      "learning_rate": 9.63855421686747e-07,
      "loss": 0.0031,
      "reward": 2.24626088142395,
      "reward_std": 0.22766248881816864,
      "rewards/accuracy_reward": 1.2540735006332397,
      "rewards/format_reward": 0.9921875,
      "step": 18
    },
    {
      "completion_length": 58.703125,
      "epoch": 0.2289156626506024,
      "grad_norm": 4.786279154756674,
      "kl": 0.109619140625,
      "learning_rate": 9.61847389558233e-07,
      "loss": 0.0044,
      "reward": 2.050855040550232,
      "reward_std": 0.35161878168582916,
      "rewards/accuracy_reward": 1.0586674511432648,
      "rewards/format_reward": 0.9921875,
      "step": 19
    },
    {
      "completion_length": 58.109375,
      "epoch": 0.24096385542168675,
      "grad_norm": 4.05967579782597,
      "kl": 0.08056640625,
      "learning_rate": 9.598393574297187e-07,
      "loss": 0.0032,
      "reward": 2.20633327960968,
      "reward_std": 0.3129453659057617,
      "rewards/accuracy_reward": 1.2219581604003906,
      "rewards/format_reward": 0.984375,
      "step": 20
    },
    {
      "completion_length": 57.71875,
      "epoch": 0.25301204819277107,
      "grad_norm": 5.8300935596675885,
      "kl": 0.080078125,
      "learning_rate": 9.57831325301205e-07,
      "loss": 0.0032,
      "reward": 2.417273759841919,
      "reward_std": 0.28760989010334015,
      "rewards/accuracy_reward": 1.4250862002372742,
      "rewards/format_reward": 0.9921875,
      "step": 21
    },
    {
      "completion_length": 54.5859375,
      "epoch": 0.26506024096385544,
      "grad_norm": 7.535044861581114,
      "kl": 0.106201171875,
      "learning_rate": 9.558232931726907e-07,
      "loss": 0.0042,
      "reward": 2.2527129650115967,
      "reward_std": 0.2951706647872925,
      "rewards/accuracy_reward": 1.2683378458023071,
      "rewards/format_reward": 0.984375,
      "step": 22
    },
    {
      "completion_length": 61.09375,
      "epoch": 0.27710843373493976,
      "grad_norm": 4.416172924233661,
      "kl": 0.10009765625,
      "learning_rate": 9.538152610441766e-07,
      "loss": 0.004,
      "reward": 2.1894314289093018,
      "reward_std": 0.21257736533880234,
      "rewards/accuracy_reward": 1.1894314289093018,
      "rewards/format_reward": 1.0,
      "step": 23
    },
    {
      "completion_length": 54.9921875,
      "epoch": 0.2891566265060241,
      "grad_norm": 4.553446996976198,
      "kl": 0.09814453125,
      "learning_rate": 9.518072289156625e-07,
      "loss": 0.0039,
      "reward": 2.3037142753601074,
      "reward_std": 0.3323938250541687,
      "rewards/accuracy_reward": 1.3115268349647522,
      "rewards/format_reward": 0.9921875,
      "step": 24
    },
    {
      "completion_length": 55.9921875,
      "epoch": 0.30120481927710846,
      "grad_norm": 8.671383785487564,
      "kl": 0.120849609375,
      "learning_rate": 9.497991967871486e-07,
      "loss": 0.0048,
      "reward": 2.239556074142456,
      "reward_std": 0.3447880446910858,
      "rewards/accuracy_reward": 1.2551808953285217,
      "rewards/format_reward": 0.984375,
      "step": 25
    },
    {
      "completion_length": 58.7890625,
      "epoch": 0.3132530120481928,
      "grad_norm": 8.322624639517006,
      "kl": 0.12353515625,
      "learning_rate": 9.477911646586345e-07,
      "loss": 0.0049,
      "reward": 2.2209770679473877,
      "reward_std": 0.3139883056282997,
      "rewards/accuracy_reward": 1.2287896275520325,
      "rewards/format_reward": 0.9921875,
      "step": 26
    },
    {
      "completion_length": 56.2421875,
      "epoch": 0.3253012048192771,
      "grad_norm": 20.55146941012377,
      "kl": 0.130126953125,
      "learning_rate": 9.457831325301205e-07,
      "loss": 0.0052,
      "reward": 2.344720959663391,
      "reward_std": 0.25742725282907486,
      "rewards/accuracy_reward": 1.3525334596633911,
      "rewards/format_reward": 0.9921875,
      "step": 27
    },
    {
      "completion_length": 52.3671875,
      "epoch": 0.3373493975903614,
      "grad_norm": 4.550988243582887,
      "kl": 0.12548828125,
      "learning_rate": 9.437751004016063e-07,
      "loss": 0.005,
      "reward": 2.407941460609436,
      "reward_std": 0.3139786869287491,
      "rewards/accuracy_reward": 1.4313790798187256,
      "rewards/format_reward": 0.9765625,
      "step": 28
    },
    {
      "completion_length": 53.328125,
      "epoch": 0.3493975903614458,
      "grad_norm": 5.133796660962732,
      "kl": 0.1435546875,
      "learning_rate": 9.417670682730924e-07,
      "loss": 0.0057,
      "reward": 2.3306795358657837,
      "reward_std": 0.3039723336696625,
      "rewards/accuracy_reward": 1.3463045954704285,
      "rewards/format_reward": 0.984375,
      "step": 29
    },
    {
      "completion_length": 53.8125,
      "epoch": 0.3614457831325301,
      "grad_norm": 6.796717577260548,
      "kl": 0.27880859375,
      "learning_rate": 9.397590361445783e-07,
      "loss": 0.0112,
      "reward": 2.2834625244140625,
      "reward_std": 0.3063512295484543,
      "rewards/accuracy_reward": 1.2834625244140625,
      "rewards/format_reward": 1.0,
      "step": 30
    },
    {
      "completion_length": 56.3203125,
      "epoch": 0.37349397590361444,
      "grad_norm": 4.3393989853337285,
      "kl": 0.14794921875,
      "learning_rate": 9.377510040160642e-07,
      "loss": 0.0059,
      "reward": 2.354575991630554,
      "reward_std": 0.314766064286232,
      "rewards/accuracy_reward": 1.3623886704444885,
      "rewards/format_reward": 0.9921875,
      "step": 31
    },
    {
      "completion_length": 54.171875,
      "epoch": 0.3855421686746988,
      "grad_norm": 4.279946209704863,
      "kl": 0.197265625,
      "learning_rate": 9.357429718875502e-07,
      "loss": 0.0079,
      "reward": 2.1385136246681213,
      "reward_std": 0.24586574733257294,
      "rewards/accuracy_reward": 1.1463261544704437,
      "rewards/format_reward": 0.9921875,
      "step": 32
    },
    {
      "completion_length": 51.4140625,
      "epoch": 0.39759036144578314,
      "grad_norm": 5.88762957444806,
      "kl": 0.1630859375,
      "learning_rate": 9.33734939759036e-07,
      "loss": 0.0065,
      "reward": 2.2907108068466187,
      "reward_std": 0.25231631100177765,
      "rewards/accuracy_reward": 1.2907109260559082,
      "rewards/format_reward": 1.0,
      "step": 33
    },
    {
      "completion_length": 50.4609375,
      "epoch": 0.40963855421686746,
      "grad_norm": 5.469228934242547,
      "kl": 0.16845703125,
      "learning_rate": 9.317269076305221e-07,
      "loss": 0.0067,
      "reward": 2.2533600330352783,
      "reward_std": 0.25808002054691315,
      "rewards/accuracy_reward": 1.2611725330352783,
      "rewards/format_reward": 0.9921875,
      "step": 34
    },
    {
      "completion_length": 47.84375,
      "epoch": 0.42168674698795183,
      "grad_norm": 5.412602747215773,
      "kl": 0.177734375,
      "learning_rate": 9.29718875502008e-07,
      "loss": 0.0071,
      "reward": 2.3132054805755615,
      "reward_std": 0.2454073503613472,
      "rewards/accuracy_reward": 1.3132054805755615,
      "rewards/format_reward": 1.0,
      "step": 35
    },
    {
      "completion_length": 44.21875,
      "epoch": 0.43373493975903615,
      "grad_norm": 5.190368238545804,
      "kl": 0.2275390625,
      "learning_rate": 9.27710843373494e-07,
      "loss": 0.0091,
      "reward": 2.2854232788085938,
      "reward_std": 0.29085223376750946,
      "rewards/accuracy_reward": 1.293235719203949,
      "rewards/format_reward": 0.9921875,
      "step": 36
    },
    {
      "completion_length": 48.71875,
      "epoch": 0.4457831325301205,
      "grad_norm": 4.780274291960778,
      "kl": 0.20751953125,
      "learning_rate": 9.257028112449798e-07,
      "loss": 0.0083,
      "reward": 2.246184825897217,
      "reward_std": 0.31601477414369583,
      "rewards/accuracy_reward": 1.261809766292572,
      "rewards/format_reward": 0.984375,
      "step": 37
    },
    {
      "completion_length": 42.265625,
      "epoch": 0.4578313253012048,
      "grad_norm": 6.234590681750942,
      "kl": 0.265625,
      "learning_rate": 9.236947791164659e-07,
      "loss": 0.0106,
      "reward": 2.112604260444641,
      "reward_std": 0.30199334025382996,
      "rewards/accuracy_reward": 1.1126042604446411,
      "rewards/format_reward": 1.0,
      "step": 38
    },
    {
      "completion_length": 45.1015625,
      "epoch": 0.46987951807228917,
      "grad_norm": 4.611394363412455,
      "kl": 0.15576171875,
      "learning_rate": 9.216867469879518e-07,
      "loss": 0.0062,
      "reward": 2.3590028285980225,
      "reward_std": 0.2973439395427704,
      "rewards/accuracy_reward": 1.3746278285980225,
      "rewards/format_reward": 0.984375,
      "step": 39
    },
    {
      "completion_length": 45.3046875,
      "epoch": 0.4819277108433735,
      "grad_norm": 6.117578716606278,
      "kl": 0.17626953125,
      "learning_rate": 9.196787148594377e-07,
      "loss": 0.0071,
      "reward": 2.2271867990493774,
      "reward_std": 0.22323830425739288,
      "rewards/accuracy_reward": 1.234999418258667,
      "rewards/format_reward": 0.9921875,
      "step": 40
    },
    {
      "completion_length": 41.9453125,
      "epoch": 0.4939759036144578,
      "grad_norm": 4.858430237306144,
      "kl": 0.2236328125,
      "learning_rate": 9.176706827309237e-07,
      "loss": 0.0089,
      "reward": 2.217424750328064,
      "reward_std": 0.2663164809346199,
      "rewards/accuracy_reward": 1.2252373099327087,
      "rewards/format_reward": 0.9921875,
      "step": 41
    },
    {
      "completion_length": 41.0234375,
      "epoch": 0.5060240963855421,
      "grad_norm": 4.127212546225013,
      "kl": 0.18212890625,
      "learning_rate": 9.156626506024095e-07,
      "loss": 0.0073,
      "reward": 2.16755473613739,
      "reward_std": 0.3387562334537506,
      "rewards/accuracy_reward": 1.1753671169281006,
      "rewards/format_reward": 0.9921875,
      "step": 42
    },
    {
      "completion_length": 42.6640625,
      "epoch": 0.5180722891566265,
      "grad_norm": 5.226665280180925,
      "kl": 0.23193359375,
      "learning_rate": 9.136546184738956e-07,
      "loss": 0.0093,
      "reward": 2.203770875930786,
      "reward_std": 0.3409430831670761,
      "rewards/accuracy_reward": 1.2350206971168518,
      "rewards/format_reward": 0.96875,
      "step": 43
    },
    {
      "completion_length": 40.9609375,
      "epoch": 0.5301204819277109,
      "grad_norm": 4.308668359699942,
      "kl": 0.134033203125,
      "learning_rate": 9.116465863453815e-07,
      "loss": 0.0054,
      "reward": 2.2817225456237793,
      "reward_std": 0.19574209302663803,
      "rewards/accuracy_reward": 1.281722605228424,
      "rewards/format_reward": 1.0,
      "step": 44
    },
    {
      "completion_length": 38.7734375,
      "epoch": 0.5421686746987951,
      "grad_norm": 6.033974360622575,
      "kl": 0.13232421875,
      "learning_rate": 9.096385542168675e-07,
      "loss": 0.0053,
      "reward": 2.2139052152633667,
      "reward_std": 0.28486668318510056,
      "rewards/accuracy_reward": 1.2451552748680115,
      "rewards/format_reward": 0.96875,
      "step": 45
    },
    {
      "completion_length": 41.1484375,
      "epoch": 0.5542168674698795,
      "grad_norm": 5.314865555502224,
      "kl": 0.11279296875,
      "learning_rate": 9.076305220883533e-07,
      "loss": 0.0045,
      "reward": 2.4188212156295776,
      "reward_std": 0.2556447684764862,
      "rewards/accuracy_reward": 1.4266336560249329,
      "rewards/format_reward": 0.9921875,
      "step": 46
    },
    {
      "completion_length": 42.7109375,
      "epoch": 0.5662650602409639,
      "grad_norm": 3.687080063413381,
      "kl": 0.123046875,
      "learning_rate": 9.056224899598393e-07,
      "loss": 0.0049,
      "reward": 2.2985291481018066,
      "reward_std": 0.2858593165874481,
      "rewards/accuracy_reward": 1.3063417077064514,
      "rewards/format_reward": 0.9921875,
      "step": 47
    },
    {
      "completion_length": 46.859375,
      "epoch": 0.5783132530120482,
      "grad_norm": 4.277184476359137,
      "kl": 0.20166015625,
      "learning_rate": 9.036144578313253e-07,
      "loss": 0.0081,
      "reward": 2.1704814434051514,
      "reward_std": 0.3619203567504883,
      "rewards/accuracy_reward": 1.186106562614441,
      "rewards/format_reward": 0.984375,
      "step": 48
    },
    {
      "completion_length": 45.21875,
      "epoch": 0.5903614457831325,
      "grad_norm": 3.7971557376020577,
      "kl": 0.124267578125,
      "learning_rate": 9.016064257028112e-07,
      "loss": 0.005,
      "reward": 2.1000068187713623,
      "reward_std": 0.2924596816301346,
      "rewards/accuracy_reward": 1.123444378376007,
      "rewards/format_reward": 0.9765625,
      "step": 49
    },
    {
      "completion_length": 44.7734375,
      "epoch": 0.6024096385542169,
      "grad_norm": 4.458817172061971,
      "kl": 0.111083984375,
      "learning_rate": 8.995983935742972e-07,
      "loss": 0.0044,
      "reward": 2.2635247707366943,
      "reward_std": 0.3522821515798569,
      "rewards/accuracy_reward": 1.2869621515274048,
      "rewards/format_reward": 0.9765625,
      "step": 50
    },
    {
      "completion_length": 51.5859375,
      "epoch": 0.6144578313253012,
      "grad_norm": 5.351600002967812,
      "kl": 0.115234375,
      "learning_rate": 8.97590361445783e-07,
      "loss": 0.0046,
      "reward": 2.321009397506714,
      "reward_std": 0.23405297100543976,
      "rewards/accuracy_reward": 1.3366344571113586,
      "rewards/format_reward": 0.984375,
      "step": 51
    },
    {
      "completion_length": 50.421875,
      "epoch": 0.6265060240963856,
      "grad_norm": 4.213335817741083,
      "kl": 0.1396484375,
      "learning_rate": 8.955823293172691e-07,
      "loss": 0.0056,
      "reward": 2.3553450107574463,
      "reward_std": 0.25443293899297714,
      "rewards/accuracy_reward": 1.3944076299667358,
      "rewards/format_reward": 0.9609375,
      "step": 52
    },
    {
      "completion_length": 60.6015625,
      "epoch": 0.6385542168674698,
      "grad_norm": 6.123689334744157,
      "kl": 0.121337890625,
      "learning_rate": 8.93574297188755e-07,
      "loss": 0.0049,
      "reward": 2.112071990966797,
      "reward_std": 0.30149899423122406,
      "rewards/accuracy_reward": 1.1433220505714417,
      "rewards/format_reward": 0.96875,
      "step": 53
    },
    {
      "completion_length": 50.0703125,
      "epoch": 0.6506024096385542,
      "grad_norm": 4.396654754831157,
      "kl": 0.1337890625,
      "learning_rate": 8.915662650602409e-07,
      "loss": 0.0053,
      "reward": 2.233729839324951,
      "reward_std": 0.23247240483760834,
      "rewards/accuracy_reward": 1.2571672797203064,
      "rewards/format_reward": 0.9765625,
      "step": 54
    },
    {
      "completion_length": 60.2890625,
      "epoch": 0.6626506024096386,
      "grad_norm": 7.03985835954293,
      "kl": 0.10498046875,
      "learning_rate": 8.895582329317268e-07,
      "loss": 0.0042,
      "reward": 2.196902871131897,
      "reward_std": 0.2882121652364731,
      "rewards/accuracy_reward": 1.2125278115272522,
      "rewards/format_reward": 0.984375,
      "step": 55
    },
    {
      "completion_length": 50.640625,
      "epoch": 0.6746987951807228,
      "grad_norm": 4.86896494949543,
      "kl": 0.12451171875,
      "learning_rate": 8.875502008032128e-07,
      "loss": 0.005,
      "reward": 2.171112537384033,
      "reward_std": 0.16461243480443954,
      "rewards/accuracy_reward": 1.1867375373840332,
      "rewards/format_reward": 0.984375,
      "step": 56
    },
    {
      "completion_length": 53.21875,
      "epoch": 0.6867469879518072,
      "grad_norm": 3.557538165261062,
      "kl": 0.1240234375,
      "learning_rate": 8.855421686746988e-07,
      "loss": 0.005,
      "reward": 2.2328275442123413,
      "reward_std": 0.2752218544483185,
      "rewards/accuracy_reward": 1.2406402230262756,
      "rewards/format_reward": 0.9921875,
      "step": 57
    },
    {
      "completion_length": 47.8671875,
      "epoch": 0.6987951807228916,
      "grad_norm": 5.180162989820259,
      "kl": 0.125,
      "learning_rate": 8.835341365461847e-07,
      "loss": 0.005,
      "reward": 2.2453041076660156,
      "reward_std": 0.315682128071785,
      "rewards/accuracy_reward": 1.268741488456726,
      "rewards/format_reward": 0.9765625,
      "step": 58
    },
    {
      "completion_length": 57.9765625,
      "epoch": 0.7108433734939759,
      "grad_norm": 3.899105782667564,
      "kl": 0.10205078125,
      "learning_rate": 8.815261044176707e-07,
      "loss": 0.0041,
      "reward": 2.284543514251709,
      "reward_std": 0.25333235412836075,
      "rewards/accuracy_reward": 1.292356252670288,
      "rewards/format_reward": 0.9921875,
      "step": 59
    },
    {
      "completion_length": 46.5859375,
      "epoch": 0.7228915662650602,
      "grad_norm": 13.765129472909528,
      "kl": 0.106201171875,
      "learning_rate": 8.795180722891565e-07,
      "loss": 0.0042,
      "reward": 2.113099694252014,
      "reward_std": 0.326066330075264,
      "rewards/accuracy_reward": 1.1287246942520142,
      "rewards/format_reward": 0.984375,
      "step": 60
    },
    {
      "completion_length": 46.375,
      "epoch": 0.7349397590361446,
      "grad_norm": 6.1270425433473,
      "kl": 0.16357421875,
      "learning_rate": 8.775100401606425e-07,
      "loss": 0.0065,
      "reward": 1.9968695640563965,
      "reward_std": 0.34320104122161865,
      "rewards/accuracy_reward": 1.0124945640563965,
      "rewards/format_reward": 0.984375,
      "step": 61
    },
    {
      "completion_length": 53.09375,
      "epoch": 0.7469879518072289,
      "grad_norm": 4.3056291481606745,
      "kl": 0.1513671875,
      "learning_rate": 8.755020080321285e-07,
      "loss": 0.0061,
      "reward": 2.1780970096588135,
      "reward_std": 0.2706674858927727,
      "rewards/accuracy_reward": 1.2093469500541687,
      "rewards/format_reward": 0.96875,
      "step": 62
    },
    {
      "completion_length": 55.9375,
      "epoch": 0.7590361445783133,
      "grad_norm": 3.2395174572422416,
      "kl": 0.14501953125,
      "learning_rate": 8.734939759036144e-07,
      "loss": 0.0058,
      "reward": 2.1430922746658325,
      "reward_std": 0.24412654340267181,
      "rewards/accuracy_reward": 1.1665297150611877,
      "rewards/format_reward": 0.9765625,
      "step": 63
    },
    {
      "completion_length": 56.6328125,
      "epoch": 0.7710843373493976,
      "grad_norm": 4.190814109425291,
      "kl": 0.11962890625,
      "learning_rate": 8.714859437751003e-07,
      "loss": 0.0048,
      "reward": 2.1700193881988525,
      "reward_std": 0.2942150831222534,
      "rewards/accuracy_reward": 1.1934569478034973,
      "rewards/format_reward": 0.9765625,
      "step": 64
    },
    {
      "completion_length": 64.3984375,
      "epoch": 0.7831325301204819,
      "grad_norm": 3.226137200230793,
      "kl": 0.102783203125,
      "learning_rate": 8.694779116465863e-07,
      "loss": 0.0041,
      "reward": 2.2898290157318115,
      "reward_std": 0.2443845123052597,
      "rewards/accuracy_reward": 1.3132665753364563,
      "rewards/format_reward": 0.9765625,
      "step": 65
    },
    {
      "completion_length": 67.7109375,
      "epoch": 0.7951807228915663,
      "grad_norm": 3.9157620361816314,
      "kl": 0.0927734375,
      "learning_rate": 8.674698795180723e-07,
      "loss": 0.0037,
      "reward": 2.161790609359741,
      "reward_std": 0.29590657353401184,
      "rewards/accuracy_reward": 1.1696029901504517,
      "rewards/format_reward": 0.9921875,
      "step": 66
    },
    {
      "completion_length": 74.3203125,
      "epoch": 0.8072289156626506,
      "grad_norm": 3.1212414712368375,
      "kl": 0.082763671875,
      "learning_rate": 8.654618473895582e-07,
      "loss": 0.0033,
      "reward": 2.215745210647583,
      "reward_std": 0.2766411006450653,
      "rewards/accuracy_reward": 1.2313700914382935,
      "rewards/format_reward": 0.984375,
      "step": 67
    },
    {
      "completion_length": 74.0390625,
      "epoch": 0.8192771084337349,
      "grad_norm": 3.446969302283755,
      "kl": 0.074951171875,
      "learning_rate": 8.634538152610441e-07,
      "loss": 0.003,
      "reward": 2.1964612007141113,
      "reward_std": 0.235237754881382,
      "rewards/accuracy_reward": 1.2198986411094666,
      "rewards/format_reward": 0.9765625,
      "step": 68
    },
    {
      "completion_length": 76.9375,
      "epoch": 0.8313253012048193,
      "grad_norm": 3.310962519125171,
      "kl": 0.08154296875,
      "learning_rate": 8.614457831325301e-07,
      "loss": 0.0033,
      "reward": 2.1269989013671875,
      "reward_std": 0.2448011264204979,
      "rewards/accuracy_reward": 1.1426239013671875,
      "rewards/format_reward": 0.984375,
      "step": 69
    },
    {
      "completion_length": 71.3984375,
      "epoch": 0.8433734939759037,
      "grad_norm": 3.2998576155248966,
      "kl": 0.0888671875,
      "learning_rate": 8.59437751004016e-07,
      "loss": 0.0036,
      "reward": 2.2479825019836426,
      "reward_std": 0.2886482775211334,
      "rewards/accuracy_reward": 1.2636074423789978,
      "rewards/format_reward": 0.984375,
      "step": 70
    },
    {
      "completion_length": 72.1484375,
      "epoch": 0.8554216867469879,
      "grad_norm": 7.668000907111886,
      "kl": 0.07861328125,
      "learning_rate": 8.57429718875502e-07,
      "loss": 0.0031,
      "reward": 2.2247371673583984,
      "reward_std": 0.2391326129436493,
      "rewards/accuracy_reward": 1.2637996673583984,
      "rewards/format_reward": 0.9609375,
      "step": 71
    },
    {
      "completion_length": 77.7734375,
      "epoch": 0.8674698795180723,
      "grad_norm": 3.4104191137958013,
      "kl": 0.068359375,
      "learning_rate": 8.554216867469879e-07,
      "loss": 0.0027,
      "reward": 2.2031702995300293,
      "reward_std": 0.21321924775838852,
      "rewards/accuracy_reward": 1.210982859134674,
      "rewards/format_reward": 0.9921875,
      "step": 72
    },
    {
      "completion_length": 76.5546875,
      "epoch": 0.8795180722891566,
      "grad_norm": 3.884229840630286,
      "kl": 0.0947265625,
      "learning_rate": 8.534136546184738e-07,
      "loss": 0.0038,
      "reward": 2.2307136058807373,
      "reward_std": 0.2959597185254097,
      "rewards/accuracy_reward": 1.2463387250900269,
      "rewards/format_reward": 0.984375,
      "step": 73
    },
    {
      "completion_length": 73.7265625,
      "epoch": 0.891566265060241,
      "grad_norm": 7.2397255809983525,
      "kl": 0.170654296875,
      "learning_rate": 8.514056224899598e-07,
      "loss": 0.0068,
      "reward": 2.311343193054199,
      "reward_std": 0.21377335488796234,
      "rewards/accuracy_reward": 1.319155752658844,
      "rewards/format_reward": 0.9921875,
      "step": 74
    },
    {
      "completion_length": 71.5859375,
      "epoch": 0.9036144578313253,
      "grad_norm": 3.397020763244455,
      "kl": 0.073974609375,
      "learning_rate": 8.493975903614458e-07,
      "loss": 0.003,
      "reward": 2.3479005098342896,
      "reward_std": 0.2722414582967758,
      "rewards/accuracy_reward": 1.3713379502296448,
      "rewards/format_reward": 0.9765625,
      "step": 75
    },
    {
      "completion_length": 64.34375,
      "epoch": 0.9156626506024096,
      "grad_norm": 4.709358727325993,
      "kl": 0.116455078125,
      "learning_rate": 8.473895582329317e-07,
      "loss": 0.0047,
      "reward": 2.1038066148757935,
      "reward_std": 0.3149692267179489,
      "rewards/accuracy_reward": 1.158493995666504,
      "rewards/format_reward": 0.9453125,
      "step": 76
    },
    {
      "completion_length": 69.390625,
      "epoch": 0.927710843373494,
      "grad_norm": 3.3768601117352923,
      "kl": 0.11376953125,
      "learning_rate": 8.453815261044176e-07,
      "loss": 0.0046,
      "reward": 2.02778023481369,
      "reward_std": 0.3105141818523407,
      "rewards/accuracy_reward": 1.074655294418335,
      "rewards/format_reward": 0.953125,
      "step": 77
    },
    {
      "completion_length": 67.328125,
      "epoch": 0.9397590361445783,
      "grad_norm": 3.504578270706009,
      "kl": 0.115234375,
      "learning_rate": 8.433734939759036e-07,
      "loss": 0.0046,
      "reward": 2.194709539413452,
      "reward_std": 0.27273692935705185,
      "rewards/accuracy_reward": 1.2181469202041626,
      "rewards/format_reward": 0.9765625,
      "step": 78
    },
    {
      "completion_length": 75.1640625,
      "epoch": 0.9518072289156626,
      "grad_norm": 4.043012399812061,
      "kl": 0.123046875,
      "learning_rate": 8.413654618473895e-07,
      "loss": 0.0049,
      "reward": 2.13509202003479,
      "reward_std": 0.313528910279274,
      "rewards/accuracy_reward": 1.18196702003479,
      "rewards/format_reward": 0.953125,
      "step": 79
    },
    {
      "completion_length": 70.0234375,
      "epoch": 0.963855421686747,
      "grad_norm": 4.870660538899373,
      "kl": 0.086181640625,
      "learning_rate": 8.393574297188755e-07,
      "loss": 0.0035,
      "reward": 2.1953389644622803,
      "reward_std": 0.26908765733242035,
      "rewards/accuracy_reward": 1.2265888452529907,
      "rewards/format_reward": 0.96875,
      "step": 80
    },
    {
      "completion_length": 80.859375,
      "epoch": 0.9759036144578314,
      "grad_norm": 3.8261245848047065,
      "kl": 0.1015625,
      "learning_rate": 8.373493975903614e-07,
      "loss": 0.0041,
      "reward": 2.0212653279304504,
      "reward_std": 0.3835397958755493,
      "rewards/accuracy_reward": 1.0915777683258057,
      "rewards/format_reward": 0.9296875,
      "step": 81
    },
    {
      "completion_length": 74.046875,
      "epoch": 0.9879518072289156,
      "grad_norm": 4.0964460767880535,
      "kl": 0.083984375,
      "learning_rate": 8.353413654618474e-07,
      "loss": 0.0034,
      "reward": 2.2536615133285522,
      "reward_std": 0.2658763527870178,
      "rewards/accuracy_reward": 1.2770991325378418,
      "rewards/format_reward": 0.9765625,
      "step": 82
    },
    {
      "completion_length": 74.58333587646484,
      "epoch": 1.0,
      "grad_norm": 2.9272571318373655,
      "kl": 0.1044921875,
      "learning_rate": 8.333333333333333e-07,
      "loss": 0.004,
      "reward": 2.1187774538993835,
      "reward_std": 0.1469321921467781,
      "rewards/accuracy_reward": 1.1187774240970612,
      "rewards/format_reward": 1.0,
      "step": 83
    },
    {
      "completion_length": 67.5390625,
      "epoch": 1.0120481927710843,
      "grad_norm": 4.360041456699287,
      "kl": 0.116455078125,
      "learning_rate": 8.313253012048192e-07,
      "loss": 0.0047,
      "reward": 2.2748764753341675,
      "reward_std": 0.30198951065540314,
      "rewards/accuracy_reward": 1.2983139157295227,
      "rewards/format_reward": 0.9765625,
      "step": 84
    },
    {
      "completion_length": 71.640625,
      "epoch": 1.0240963855421688,
      "grad_norm": 3.852904865115574,
      "kl": 0.100341796875,
      "learning_rate": 8.293172690763052e-07,
      "loss": 0.004,
      "reward": 2.22179639339447,
      "reward_std": 0.2614322751760483,
      "rewards/accuracy_reward": 1.2452340126037598,
      "rewards/format_reward": 0.9765625,
      "step": 85
    },
    {
      "completion_length": 77.71875,
      "epoch": 1.036144578313253,
      "grad_norm": 4.570601093607917,
      "kl": 0.086181640625,
      "learning_rate": 8.273092369477911e-07,
      "loss": 0.0034,
      "reward": 2.3267804384231567,
      "reward_std": 0.1871008574962616,
      "rewards/accuracy_reward": 1.3424054384231567,
      "rewards/format_reward": 0.984375,
      "step": 86
    },
    {
      "completion_length": 74.0703125,
      "epoch": 1.0481927710843373,
      "grad_norm": 4.387034223472388,
      "kl": 0.09033203125,
      "learning_rate": 8.253012048192771e-07,
      "loss": 0.0036,
      "reward": 2.280067205429077,
      "reward_std": 0.2090277522802353,
      "rewards/accuracy_reward": 1.2800670266151428,
      "rewards/format_reward": 1.0,
      "step": 87
    },
    {
      "completion_length": 72.8828125,
      "epoch": 1.0602409638554218,
      "grad_norm": 3.640432077142004,
      "kl": 0.097412109375,
      "learning_rate": 8.23293172690763e-07,
      "loss": 0.0039,
      "reward": 2.2264442443847656,
      "reward_std": 0.2877971976995468,
      "rewards/accuracy_reward": 1.2576942443847656,
      "rewards/format_reward": 0.96875,
      "step": 88
    },
    {
      "completion_length": 68.9765625,
      "epoch": 1.072289156626506,
      "grad_norm": 3.6617214501921755,
      "kl": 0.10107421875,
      "learning_rate": 8.21285140562249e-07,
      "loss": 0.004,
      "reward": 2.232625722885132,
      "reward_std": 0.26599176973104477,
      "rewards/accuracy_reward": 1.2482507824897766,
      "rewards/format_reward": 0.984375,
      "step": 89
    },
    {
      "completion_length": 74.765625,
      "epoch": 1.0843373493975903,
      "grad_norm": 4.600311265578528,
      "kl": 0.09130859375,
      "learning_rate": 8.192771084337349e-07,
      "loss": 0.0037,
      "reward": 2.253629207611084,
      "reward_std": 0.21175827831029892,
      "rewards/accuracy_reward": 1.269254207611084,
      "rewards/format_reward": 0.984375,
      "step": 90
    },
    {
      "completion_length": 76.59375,
      "epoch": 1.0963855421686748,
      "grad_norm": 4.145602929032845,
      "kl": 0.087646484375,
      "learning_rate": 8.172690763052207e-07,
      "loss": 0.0035,
      "reward": 2.2744953632354736,
      "reward_std": 0.24358398467302322,
      "rewards/accuracy_reward": 1.2901203632354736,
      "rewards/format_reward": 0.984375,
      "step": 91
    },
    {
      "completion_length": 75.875,
      "epoch": 1.108433734939759,
      "grad_norm": 3.8292102418969853,
      "kl": 0.10693359375,
      "learning_rate": 8.152610441767068e-07,
      "loss": 0.0043,
      "reward": 2.4102468490600586,
      "reward_std": 0.22168071568012238,
      "rewards/accuracy_reward": 1.4180592894554138,
      "rewards/format_reward": 0.9921875,
      "step": 92
    },
    {
      "completion_length": 73.5078125,
      "epoch": 1.1204819277108433,
      "grad_norm": 3.889694391559541,
      "kl": 0.0859375,
      "learning_rate": 8.132530120481927e-07,
      "loss": 0.0034,
      "reward": 2.19115674495697,
      "reward_std": 0.191669300198555,
      "rewards/accuracy_reward": 1.1989692449569702,
      "rewards/format_reward": 0.9921875,
      "step": 93
    },
    {
      "completion_length": 74.359375,
      "epoch": 1.1325301204819278,
      "grad_norm": 13.572499915490392,
      "kl": 0.115966796875,
      "learning_rate": 8.112449799196787e-07,
      "loss": 0.0046,
      "reward": 2.3821544647216797,
      "reward_std": 0.2079356163740158,
      "rewards/accuracy_reward": 1.3899668455123901,
      "rewards/format_reward": 0.9921875,
      "step": 94
    },
    {
      "completion_length": 70.875,
      "epoch": 1.144578313253012,
      "grad_norm": 3.96863603284974,
      "kl": 0.096923828125,
      "learning_rate": 8.092369477911646e-07,
      "loss": 0.0039,
      "reward": 2.301279664039612,
      "reward_std": 0.17724627256393433,
      "rewards/accuracy_reward": 1.309092104434967,
      "rewards/format_reward": 0.9921875,
      "step": 95
    },
    {
      "completion_length": 69.3125,
      "epoch": 1.1566265060240963,
      "grad_norm": 3.4379001474745206,
      "kl": 0.090087890625,
      "learning_rate": 8.072289156626506e-07,
      "loss": 0.0036,
      "reward": 2.371612310409546,
      "reward_std": 0.1584479957818985,
      "rewards/accuracy_reward": 1.371612310409546,
      "rewards/format_reward": 1.0,
      "step": 96
    },
    {
      "completion_length": 68.6171875,
      "epoch": 1.1686746987951806,
      "grad_norm": 4.586260816062996,
      "kl": 0.09375,
      "learning_rate": 8.052208835341365e-07,
      "loss": 0.0037,
      "reward": 2.4862219095230103,
      "reward_std": 0.20000579208135605,
      "rewards/accuracy_reward": 1.4862220287322998,
      "rewards/format_reward": 1.0,
      "step": 97
    },
    {
      "completion_length": 70.015625,
      "epoch": 1.180722891566265,
      "grad_norm": 4.047101829945655,
      "kl": 0.112060546875,
      "learning_rate": 8.032128514056225e-07,
      "loss": 0.0045,
      "reward": 2.2514266967773438,
      "reward_std": 0.22294947504997253,
      "rewards/accuracy_reward": 1.2514267563819885,
      "rewards/format_reward": 1.0,
      "step": 98
    },
    {
      "completion_length": 66.9140625,
      "epoch": 1.1927710843373494,
      "grad_norm": 5.444249065473958,
      "kl": 0.088134765625,
      "learning_rate": 8.012048192771084e-07,
      "loss": 0.0035,
      "reward": 2.333179473876953,
      "reward_std": 0.1811930388212204,
      "rewards/accuracy_reward": 1.3331794738769531,
      "rewards/format_reward": 1.0,
      "step": 99
    },
    {
      "completion_length": 65.828125,
      "epoch": 1.2048192771084336,
      "grad_norm": 7.074570957060863,
      "kl": 0.1064453125,
      "learning_rate": 7.991967871485942e-07,
      "loss": 0.0043,
      "reward": 2.278498649597168,
      "reward_std": 0.17714769393205643,
      "rewards/accuracy_reward": 1.2863109111785889,
      "rewards/format_reward": 0.9921875,
      "step": 100
    },
    {
      "completion_length": 62.6875,
      "epoch": 1.216867469879518,
      "grad_norm": 6.600402598086416,
      "kl": 0.099609375,
      "learning_rate": 7.971887550200803e-07,
      "loss": 0.004,
      "reward": 2.3798866271972656,
      "reward_std": 0.1492375209927559,
      "rewards/accuracy_reward": 1.3798866868019104,
      "rewards/format_reward": 1.0,
      "step": 101
    },
    {
      "completion_length": 67.234375,
      "epoch": 1.2289156626506024,
      "grad_norm": 5.4322907915163645,
      "kl": 0.0927734375,
      "learning_rate": 7.951807228915662e-07,
      "loss": 0.0037,
      "reward": 2.295409917831421,
      "reward_std": 0.26540718972682953,
      "rewards/accuracy_reward": 1.311034917831421,
      "rewards/format_reward": 0.984375,
      "step": 102
    },
    {
      "completion_length": 62.59375,
      "epoch": 1.2409638554216866,
      "grad_norm": 4.734234621294123,
      "kl": 0.10986328125,
      "learning_rate": 7.931726907630522e-07,
      "loss": 0.0044,
      "reward": 2.3131519556045532,
      "reward_std": 0.2041746824979782,
      "rewards/accuracy_reward": 1.3209643959999084,
      "rewards/format_reward": 0.9921875,
      "step": 103
    },
    {
      "completion_length": 65.0078125,
      "epoch": 1.2530120481927711,
      "grad_norm": 11.27432402123553,
      "kl": 0.094482421875,
      "learning_rate": 7.911646586345381e-07,
      "loss": 0.0038,
      "reward": 2.423591375350952,
      "reward_std": 0.17853456735610962,
      "rewards/accuracy_reward": 1.4235913753509521,
      "rewards/format_reward": 1.0,
      "step": 104
    },
    {
      "completion_length": 61.96875,
      "epoch": 1.2650602409638554,
      "grad_norm": 5.605209449566961,
      "kl": 0.10595703125,
      "learning_rate": 7.891566265060241e-07,
      "loss": 0.0042,
      "reward": 2.2498486042022705,
      "reward_std": 0.2505866587162018,
      "rewards/accuracy_reward": 1.2576610445976257,
      "rewards/format_reward": 0.9921875,
      "step": 105
    },
    {
      "completion_length": 69.890625,
      "epoch": 1.2771084337349397,
      "grad_norm": 9.555144265496201,
      "kl": 0.1015625,
      "learning_rate": 7.8714859437751e-07,
      "loss": 0.0041,
      "reward": 2.153669834136963,
      "reward_std": 0.2159716784954071,
      "rewards/accuracy_reward": 1.161482334136963,
      "rewards/format_reward": 0.9921875,
      "step": 106
    },
    {
      "completion_length": 63.5625,
      "epoch": 1.2891566265060241,
      "grad_norm": 4.205528221959235,
      "kl": 0.100341796875,
      "learning_rate": 7.851405622489959e-07,
      "loss": 0.004,
      "reward": 2.2599010467529297,
      "reward_std": 0.22189538180828094,
      "rewards/accuracy_reward": 1.2599008083343506,
      "rewards/format_reward": 1.0,
      "step": 107
    },
    {
      "completion_length": 60.3359375,
      "epoch": 1.3012048192771084,
      "grad_norm": 4.549607105799596,
      "kl": 0.13525390625,
      "learning_rate": 7.831325301204819e-07,
      "loss": 0.0054,
      "reward": 2.2945663928985596,
      "reward_std": 0.2269488275051117,
      "rewards/accuracy_reward": 1.2945663928985596,
      "rewards/format_reward": 1.0,
      "step": 108
    },
    {
      "completion_length": 63.9765625,
      "epoch": 1.3132530120481927,
      "grad_norm": 7.122658458301131,
      "kl": 0.10400390625,
      "learning_rate": 7.811244979919679e-07,
      "loss": 0.0042,
      "reward": 2.223813772201538,
      "reward_std": 0.2691728472709656,
      "rewards/accuracy_reward": 1.2316263318061829,
      "rewards/format_reward": 0.9921875,
      "step": 109
    },
    {
      "completion_length": 64.0390625,
      "epoch": 1.3253012048192772,
      "grad_norm": 4.0970391288989285,
      "kl": 0.102783203125,
      "learning_rate": 7.791164658634538e-07,
      "loss": 0.0041,
      "reward": 2.402035713195801,
      "reward_std": 0.2192593812942505,
      "rewards/accuracy_reward": 1.409848153591156,
      "rewards/format_reward": 0.9921875,
      "step": 110
    },
    {
      "completion_length": 61.984375,
      "epoch": 1.3373493975903614,
      "grad_norm": 5.00798288991921,
      "kl": 0.100830078125,
      "learning_rate": 7.771084337349397e-07,
      "loss": 0.004,
      "reward": 2.268544912338257,
      "reward_std": 0.17878198623657227,
      "rewards/accuracy_reward": 1.2685450315475464,
      "rewards/format_reward": 1.0,
      "step": 111
    },
    {
      "completion_length": 58.296875,
      "epoch": 1.3493975903614457,
      "grad_norm": 4.283142882967245,
      "kl": 0.10888671875,
      "learning_rate": 7.751004016064257e-07,
      "loss": 0.0044,
      "reward": 2.373852849006653,
      "reward_std": 0.17504306137561798,
      "rewards/accuracy_reward": 1.3738529086112976,
      "rewards/format_reward": 1.0,
      "step": 112
    },
    {
      "completion_length": 60.484375,
      "epoch": 1.3614457831325302,
      "grad_norm": 4.840347639337677,
      "kl": 0.097412109375,
      "learning_rate": 7.730923694779116e-07,
      "loss": 0.0039,
      "reward": 2.2944198846817017,
      "reward_std": 0.2088237851858139,
      "rewards/accuracy_reward": 1.2944198250770569,
      "rewards/format_reward": 1.0,
      "step": 113
    },
    {
      "completion_length": 59.6328125,
      "epoch": 1.3734939759036144,
      "grad_norm": 3.441438097506757,
      "kl": 0.095458984375,
      "learning_rate": 7.710843373493975e-07,
      "loss": 0.0038,
      "reward": 2.2015284299850464,
      "reward_std": 0.22288134694099426,
      "rewards/accuracy_reward": 1.201528549194336,
      "rewards/format_reward": 1.0,
      "step": 114
    },
    {
      "completion_length": 58.3203125,
      "epoch": 1.3855421686746987,
      "grad_norm": 5.2560716101244545,
      "kl": 0.12890625,
      "learning_rate": 7.690763052208835e-07,
      "loss": 0.0052,
      "reward": 2.395646095275879,
      "reward_std": 0.21848639845848083,
      "rewards/accuracy_reward": 1.3956461548805237,
      "rewards/format_reward": 1.0,
      "step": 115
    },
    {
      "completion_length": 58.2734375,
      "epoch": 1.3975903614457832,
      "grad_norm": 5.450406858307557,
      "kl": 0.1064453125,
      "learning_rate": 7.670682730923694e-07,
      "loss": 0.0043,
      "reward": 2.4746010303497314,
      "reward_std": 0.1482101045548916,
      "rewards/accuracy_reward": 1.4746010303497314,
      "rewards/format_reward": 1.0,
      "step": 116
    },
    {
      "completion_length": 57.65625,
      "epoch": 1.4096385542168675,
      "grad_norm": 4.642950561404122,
      "kl": 0.124267578125,
      "learning_rate": 7.650602409638554e-07,
      "loss": 0.005,
      "reward": 2.1899147033691406,
      "reward_std": 0.2073155865073204,
      "rewards/accuracy_reward": 1.1977271437644958,
      "rewards/format_reward": 0.9921875,
      "step": 117
    },
    {
      "completion_length": 56.609375,
      "epoch": 1.4216867469879517,
      "grad_norm": 9.36763410057133,
      "kl": 0.112548828125,
      "learning_rate": 7.630522088353414e-07,
      "loss": 0.0045,
      "reward": 2.457427501678467,
      "reward_std": 0.248141810297966,
      "rewards/accuracy_reward": 1.4574276804924011,
      "rewards/format_reward": 1.0,
      "step": 118
    },
    {
      "completion_length": 55.59375,
      "epoch": 1.4337349397590362,
      "grad_norm": 4.076025029890633,
      "kl": 0.095947265625,
      "learning_rate": 7.610441767068273e-07,
      "loss": 0.0038,
      "reward": 2.3175806999206543,
      "reward_std": 0.21353702247142792,
      "rewards/accuracy_reward": 1.3175806999206543,
      "rewards/format_reward": 1.0,
      "step": 119
    },
    {
      "completion_length": 56.359375,
      "epoch": 1.4457831325301205,
      "grad_norm": 4.1118838634058905,
      "kl": 0.10693359375,
      "learning_rate": 7.590361445783132e-07,
      "loss": 0.0043,
      "reward": 2.306099772453308,
      "reward_std": 0.2674330025911331,
      "rewards/accuracy_reward": 1.3217247128486633,
      "rewards/format_reward": 0.984375,
      "step": 120
    },
    {
      "completion_length": 56.765625,
      "epoch": 1.4578313253012047,
      "grad_norm": 4.370520474393478,
      "kl": 0.10302734375,
      "learning_rate": 7.570281124497991e-07,
      "loss": 0.0041,
      "reward": 2.1378331184387207,
      "reward_std": 0.24683931469917297,
      "rewards/accuracy_reward": 1.1378332376480103,
      "rewards/format_reward": 1.0,
      "step": 121
    },
    {
      "completion_length": 61.4453125,
      "epoch": 1.4698795180722892,
      "grad_norm": 3.7827942646929427,
      "kl": 0.120361328125,
      "learning_rate": 7.550200803212851e-07,
      "loss": 0.0048,
      "reward": 2.1952574253082275,
      "reward_std": 0.163675457239151,
      "rewards/accuracy_reward": 1.1952574849128723,
      "rewards/format_reward": 1.0,
      "step": 122
    },
    {
      "completion_length": 64.2734375,
      "epoch": 1.4819277108433735,
      "grad_norm": 3.7942059326042887,
      "kl": 0.115478515625,
      "learning_rate": 7.53012048192771e-07,
      "loss": 0.0046,
      "reward": 2.052876114845276,
      "reward_std": 0.3279467225074768,
      "rewards/accuracy_reward": 1.0606885850429535,
      "rewards/format_reward": 0.9921875,
      "step": 123
    },
    {
      "completion_length": 61.7578125,
      "epoch": 1.4939759036144578,
      "grad_norm": 4.163145774578374,
      "kl": 0.1083984375,
      "learning_rate": 7.51004016064257e-07,
      "loss": 0.0043,
      "reward": 2.483773946762085,
      "reward_std": 0.21236886084079742,
      "rewards/accuracy_reward": 1.483773946762085,
      "rewards/format_reward": 1.0,
      "step": 124
    },
    {
      "completion_length": 69.8359375,
      "epoch": 1.5060240963855422,
      "grad_norm": 8.540024207287942,
      "kl": 0.122314453125,
      "learning_rate": 7.489959839357429e-07,
      "loss": 0.0049,
      "reward": 2.207366466522217,
      "reward_std": 0.22365009784698486,
      "rewards/accuracy_reward": 1.2073664665222168,
      "rewards/format_reward": 1.0,
      "step": 125
    },
    {
      "completion_length": 68.21875,
      "epoch": 1.5180722891566265,
      "grad_norm": 4.163585518888115,
      "kl": 0.097412109375,
      "learning_rate": 7.469879518072289e-07,
      "loss": 0.0039,
      "reward": 2.3682451248168945,
      "reward_std": 0.17314215004444122,
      "rewards/accuracy_reward": 1.3682451844215393,
      "rewards/format_reward": 1.0,
      "step": 126
    },
    {
      "completion_length": 74.7734375,
      "epoch": 1.5301204819277108,
      "grad_norm": 5.7954755578535595,
      "kl": 0.09912109375,
      "learning_rate": 7.449799196787149e-07,
      "loss": 0.004,
      "reward": 2.3054428100585938,
      "reward_std": 0.166117824614048,
      "rewards/accuracy_reward": 1.313255250453949,
      "rewards/format_reward": 0.9921875,
      "step": 127
    },
    {
      "completion_length": 77.3046875,
      "epoch": 1.5421686746987953,
      "grad_norm": 4.318669163836461,
      "kl": 0.091796875,
      "learning_rate": 7.429718875502008e-07,
      "loss": 0.0037,
      "reward": 2.1308990716934204,
      "reward_std": 0.19852972030639648,
      "rewards/accuracy_reward": 1.13089919090271,
      "rewards/format_reward": 1.0,
      "step": 128
    },
    {
      "completion_length": 78.1015625,
      "epoch": 1.5542168674698795,
      "grad_norm": 4.096032296356097,
      "kl": 0.102783203125,
      "learning_rate": 7.409638554216867e-07,
      "loss": 0.0041,
      "reward": 2.445680260658264,
      "reward_std": 0.1704091727733612,
      "rewards/accuracy_reward": 1.4456802010536194,
      "rewards/format_reward": 1.0,
      "step": 129
    },
    {
      "completion_length": 74.75,
      "epoch": 1.5662650602409638,
      "grad_norm": 4.47404453525868,
      "kl": 0.100341796875,
      "learning_rate": 7.389558232931726e-07,
      "loss": 0.004,
      "reward": 2.2448705434799194,
      "reward_std": 0.21340852975845337,
      "rewards/accuracy_reward": 1.2448704838752747,
      "rewards/format_reward": 1.0,
      "step": 130
    },
    {
      "completion_length": 75.3671875,
      "epoch": 1.5783132530120483,
      "grad_norm": 23.135090346261265,
      "kl": 1.1025390625,
      "learning_rate": 7.369477911646586e-07,
      "loss": 0.0444,
      "reward": 2.368005871772766,
      "reward_std": 0.24276328086853027,
      "rewards/accuracy_reward": 1.3680058717727661,
      "rewards/format_reward": 1.0,
      "step": 131
    },
    {
      "completion_length": 76.5234375,
      "epoch": 1.5903614457831325,
      "grad_norm": 3.560296625305877,
      "kl": 0.14111328125,
      "learning_rate": 7.349397590361446e-07,
      "loss": 0.0056,
      "reward": 2.3832234144210815,
      "reward_std": 0.2271246314048767,
      "rewards/accuracy_reward": 1.398848533630371,
      "rewards/format_reward": 0.984375,
      "step": 132
    },
    {
      "completion_length": 78.515625,
      "epoch": 1.6024096385542168,
      "grad_norm": 4.271885997013165,
      "kl": 0.103271484375,
      "learning_rate": 7.329317269076305e-07,
      "loss": 0.0041,
      "reward": 2.11967396736145,
      "reward_std": 0.21069814264774323,
      "rewards/accuracy_reward": 1.119674026966095,
      "rewards/format_reward": 1.0,
      "step": 133
    },
    {
      "completion_length": 81.2109375,
      "epoch": 1.6144578313253013,
      "grad_norm": 3.989749340172797,
      "kl": 0.10009765625,
      "learning_rate": 7.309236947791164e-07,
      "loss": 0.004,
      "reward": 2.2381746768951416,
      "reward_std": 0.2712934762239456,
      "rewards/accuracy_reward": 1.2537997961044312,
      "rewards/format_reward": 0.984375,
      "step": 134
    },
    {
      "completion_length": 84.828125,
      "epoch": 1.6265060240963856,
      "grad_norm": 5.101727030105181,
      "kl": 0.0927734375,
      "learning_rate": 7.289156626506024e-07,
      "loss": 0.0037,
      "reward": 2.3006190061569214,
      "reward_std": 0.2388201355934143,
      "rewards/accuracy_reward": 1.3084314465522766,
      "rewards/format_reward": 0.9921875,
      "step": 135
    },
    {
      "completion_length": 78.3984375,
      "epoch": 1.6385542168674698,
      "grad_norm": 7.945369222479043,
      "kl": 0.109130859375,
      "learning_rate": 7.269076305220884e-07,
      "loss": 0.0044,
      "reward": 2.187756061553955,
      "reward_std": 0.22536994516849518,
      "rewards/accuracy_reward": 1.2033808827400208,
      "rewards/format_reward": 0.984375,
      "step": 136
    },
    {
      "completion_length": 83.0234375,
      "epoch": 1.6506024096385543,
      "grad_norm": 7.511759922163927,
      "kl": 0.074462890625,
      "learning_rate": 7.248995983935742e-07,
      "loss": 0.003,
      "reward": 2.299572706222534,
      "reward_std": 0.22408785670995712,
      "rewards/accuracy_reward": 1.3073852062225342,
      "rewards/format_reward": 0.9921875,
      "step": 137
    },
    {
      "completion_length": 84.640625,
      "epoch": 1.6626506024096386,
      "grad_norm": 3.2982396535282623,
      "kl": 0.0810546875,
      "learning_rate": 7.228915662650602e-07,
      "loss": 0.0032,
      "reward": 2.3804391622543335,
      "reward_std": 0.2060808688402176,
      "rewards/accuracy_reward": 1.3804389834403992,
      "rewards/format_reward": 1.0,
      "step": 138
    },
    {
      "completion_length": 87.8125,
      "epoch": 1.6746987951807228,
      "grad_norm": 8.41708008218346,
      "kl": 0.0810546875,
      "learning_rate": 7.208835341365461e-07,
      "loss": 0.0032,
      "reward": 2.2146860361099243,
      "reward_std": 0.2540859431028366,
      "rewards/accuracy_reward": 1.2146860361099243,
      "rewards/format_reward": 1.0,
      "step": 139
    },
    {
      "completion_length": 86.140625,
      "epoch": 1.6867469879518073,
      "grad_norm": 3.5435273544538815,
      "kl": 0.072998046875,
      "learning_rate": 7.188755020080321e-07,
      "loss": 0.0029,
      "reward": 2.3307693004608154,
      "reward_std": 0.20385809987783432,
      "rewards/accuracy_reward": 1.3385818004608154,
      "rewards/format_reward": 0.9921875,
      "step": 140
    },
    {
      "completion_length": 85.9375,
      "epoch": 1.6987951807228916,
      "grad_norm": 3.544683408089574,
      "kl": 0.083984375,
      "learning_rate": 7.168674698795181e-07,
      "loss": 0.0034,
      "reward": 2.2913438081741333,
      "reward_std": 0.26863446831703186,
      "rewards/accuracy_reward": 1.3069688081741333,
      "rewards/format_reward": 0.984375,
      "step": 141
    },
    {
      "completion_length": 83.2578125,
      "epoch": 1.7108433734939759,
      "grad_norm": 4.741927242341381,
      "kl": 0.12548828125,
      "learning_rate": 7.14859437751004e-07,
      "loss": 0.005,
      "reward": 2.3960628509521484,
      "reward_std": 0.2550785541534424,
      "rewards/accuracy_reward": 1.3960627913475037,
      "rewards/format_reward": 1.0,
      "step": 142
    },
    {
      "completion_length": 86.671875,
      "epoch": 1.7228915662650603,
      "grad_norm": 3.0874349711182494,
      "kl": 0.07470703125,
      "learning_rate": 7.128514056224899e-07,
      "loss": 0.003,
      "reward": 2.3813560009002686,
      "reward_std": 0.25298502296209335,
      "rewards/accuracy_reward": 1.381356120109558,
      "rewards/format_reward": 1.0,
      "step": 143
    },
    {
      "completion_length": 80.40625,
      "epoch": 1.7349397590361446,
      "grad_norm": 9.215211678123678,
      "kl": 0.085693359375,
      "learning_rate": 7.108433734939758e-07,
      "loss": 0.0034,
      "reward": 2.3150322437286377,
      "reward_std": 0.23231424391269684,
      "rewards/accuracy_reward": 1.315032422542572,
      "rewards/format_reward": 1.0,
      "step": 144
    },
    {
      "completion_length": 79.5859375,
      "epoch": 1.7469879518072289,
      "grad_norm": 3.3677362414264307,
      "kl": 0.098876953125,
      "learning_rate": 7.088353413654619e-07,
      "loss": 0.0039,
      "reward": 2.2901567220687866,
      "reward_std": 0.21487458050251007,
      "rewards/accuracy_reward": 1.2979693412780762,
      "rewards/format_reward": 0.9921875,
      "step": 145
    },
    {
      "completion_length": 87.2734375,
      "epoch": 1.7590361445783134,
      "grad_norm": 3.8053306313986037,
      "kl": 0.104736328125,
      "learning_rate": 7.068273092369477e-07,
      "loss": 0.0042,
      "reward": 2.2074761390686035,
      "reward_std": 0.24223129451274872,
      "rewards/accuracy_reward": 1.2074760794639587,
      "rewards/format_reward": 1.0,
      "step": 146
    },
    {
      "completion_length": 88.984375,
      "epoch": 1.7710843373493976,
      "grad_norm": 4.960937467624004,
      "kl": 0.08251953125,
      "learning_rate": 7.048192771084337e-07,
      "loss": 0.0033,
      "reward": 2.2357683181762695,
      "reward_std": 0.2608248367905617,
      "rewards/accuracy_reward": 1.2435806393623352,
      "rewards/format_reward": 0.9921875,
      "step": 147
    },
    {
      "completion_length": 80.421875,
      "epoch": 1.783132530120482,
      "grad_norm": 3.5313461555382717,
| "kl": 0.106689453125, | |
| "learning_rate": 7.028112449799196e-07, | |
| "loss": 0.0042, | |
| "reward": 2.223365068435669, | |
| "reward_std": 0.20793087780475616, | |
| "rewards/accuracy_reward": 1.2311774492263794, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 148 | |
| }, | |
| { | |
| "completion_length": 81.6328125, | |
| "epoch": 1.7951807228915664, | |
| "grad_norm": 3.917968857756188, | |
| "kl": 0.082763671875, | |
| "learning_rate": 7.008032128514057e-07, | |
| "loss": 0.0033, | |
| "reward": 2.431049346923828, | |
| "reward_std": 0.25210463255643845, | |
| "rewards/accuracy_reward": 1.4310495257377625, | |
| "rewards/format_reward": 1.0, | |
| "step": 149 | |
| }, | |
| { | |
| "completion_length": 82.71875, | |
| "epoch": 1.8072289156626506, | |
| "grad_norm": 3.2751640437820417, | |
| "kl": 0.105224609375, | |
| "learning_rate": 6.987951807228916e-07, | |
| "loss": 0.0042, | |
| "reward": 2.167607069015503, | |
| "reward_std": 0.20023201406002045, | |
| "rewards/accuracy_reward": 1.183232069015503, | |
| "rewards/format_reward": 0.984375, | |
| "step": 150 | |
| }, | |
| { | |
| "completion_length": 80.1015625, | |
| "epoch": 1.819277108433735, | |
| "grad_norm": 3.696030829693263, | |
| "kl": 0.09716796875, | |
| "learning_rate": 6.967871485943774e-07, | |
| "loss": 0.0039, | |
| "reward": 2.545083999633789, | |
| "reward_std": 0.17634352296590805, | |
| "rewards/accuracy_reward": 1.5450841188430786, | |
| "rewards/format_reward": 1.0, | |
| "step": 151 | |
| }, | |
| { | |
| "completion_length": 81.6484375, | |
| "epoch": 1.8313253012048194, | |
| "grad_norm": 5.419229696650584, | |
| "kl": 0.119873046875, | |
| "learning_rate": 6.947791164658634e-07, | |
| "loss": 0.0048, | |
| "reward": 2.144273281097412, | |
| "reward_std": 0.2491978257894516, | |
| "rewards/accuracy_reward": 1.152085781097412, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 152 | |
| }, | |
| { | |
| "completion_length": 77.96875, | |
| "epoch": 1.8433734939759037, | |
| "grad_norm": 34.81233821704641, | |
| "kl": 0.09619140625, | |
| "learning_rate": 6.927710843373493e-07, | |
| "loss": 0.0039, | |
| "reward": 2.4207249879837036, | |
| "reward_std": 0.22066732123494148, | |
| "rewards/accuracy_reward": 1.4207251071929932, | |
| "rewards/format_reward": 1.0, | |
| "step": 153 | |
| }, | |
| { | |
| "completion_length": 81.3984375, | |
| "epoch": 1.855421686746988, | |
| "grad_norm": 4.095705367504911, | |
| "kl": 0.101806640625, | |
| "learning_rate": 6.907630522088354e-07, | |
| "loss": 0.0041, | |
| "reward": 2.160383105278015, | |
| "reward_std": 0.27165083587169647, | |
| "rewards/accuracy_reward": 1.1681956052780151, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 154 | |
| }, | |
| { | |
| "completion_length": 79.78125, | |
| "epoch": 1.8674698795180724, | |
| "grad_norm": 3.0440685644807663, | |
| "kl": 0.11865234375, | |
| "learning_rate": 6.887550200803212e-07, | |
| "loss": 0.0047, | |
| "reward": 2.4971319437026978, | |
| "reward_std": 0.16808781027793884, | |
| "rewards/accuracy_reward": 1.4971320629119873, | |
| "rewards/format_reward": 1.0, | |
| "step": 155 | |
| }, | |
| { | |
| "completion_length": 83.09375, | |
| "epoch": 1.8795180722891565, | |
| "grad_norm": 3.1771226883841206, | |
| "kl": 0.10498046875, | |
| "learning_rate": 6.867469879518072e-07, | |
| "loss": 0.0042, | |
| "reward": 2.1450811624526978, | |
| "reward_std": 0.2694619745016098, | |
| "rewards/accuracy_reward": 1.1450812816619873, | |
| "rewards/format_reward": 1.0, | |
| "step": 156 | |
| }, | |
| { | |
| "completion_length": 81.9453125, | |
| "epoch": 1.891566265060241, | |
| "grad_norm": 3.4230588560037583, | |
| "kl": 0.113525390625, | |
| "learning_rate": 6.847389558232931e-07, | |
| "loss": 0.0045, | |
| "reward": 2.44959032535553, | |
| "reward_std": 0.16196198761463165, | |
| "rewards/accuracy_reward": 1.4574028253555298, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 157 | |
| }, | |
| { | |
| "completion_length": 86.203125, | |
| "epoch": 1.9036144578313254, | |
| "grad_norm": 5.9344079114737, | |
| "kl": 0.1015625, | |
| "learning_rate": 6.827309236947792e-07, | |
| "loss": 0.0041, | |
| "reward": 2.1924350261688232, | |
| "reward_std": 0.1869198903441429, | |
| "rewards/accuracy_reward": 1.1924351453781128, | |
| "rewards/format_reward": 1.0, | |
| "step": 158 | |
| }, | |
| { | |
| "completion_length": 84.7734375, | |
| "epoch": 1.9156626506024095, | |
| "grad_norm": 3.7338258911048707, | |
| "kl": 0.105224609375, | |
| "learning_rate": 6.807228915662651e-07, | |
| "loss": 0.0042, | |
| "reward": 2.298088550567627, | |
| "reward_std": 0.2152806669473648, | |
| "rewards/accuracy_reward": 1.3059011697769165, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 159 | |
| }, | |
| { | |
| "completion_length": 88.2109375, | |
| "epoch": 1.927710843373494, | |
| "grad_norm": 3.2737012532681535, | |
| "kl": 0.124755859375, | |
| "learning_rate": 6.787148594377509e-07, | |
| "loss": 0.005, | |
| "reward": 2.3695740699768066, | |
| "reward_std": 0.300421878695488, | |
| "rewards/accuracy_reward": 1.3930113911628723, | |
| "rewards/format_reward": 0.9765625, | |
| "step": 160 | |
| }, | |
| { | |
| "completion_length": 82.9921875, | |
| "epoch": 1.9397590361445785, | |
| "grad_norm": 14.347253854862437, | |
| "kl": 0.119873046875, | |
| "learning_rate": 6.767068273092369e-07, | |
| "loss": 0.0048, | |
| "reward": 2.306626796722412, | |
| "reward_std": 0.2548489645123482, | |
| "rewards/accuracy_reward": 1.3222516179084778, | |
| "rewards/format_reward": 0.984375, | |
| "step": 161 | |
| }, | |
| { | |
| "completion_length": 87.734375, | |
| "epoch": 1.9518072289156625, | |
| "grad_norm": 3.457686333163172, | |
| "kl": 0.109375, | |
| "learning_rate": 6.746987951807228e-07, | |
| "loss": 0.0044, | |
| "reward": 2.2328758239746094, | |
| "reward_std": 0.28791245073080063, | |
| "rewards/accuracy_reward": 1.2641257643699646, | |
| "rewards/format_reward": 0.96875, | |
| "step": 162 | |
| }, | |
| { | |
| "completion_length": 83.25, | |
| "epoch": 1.963855421686747, | |
| "grad_norm": 4.1768305143971824, | |
| "kl": 0.12353515625, | |
| "learning_rate": 6.726907630522089e-07, | |
| "loss": 0.0049, | |
| "reward": 2.2161502838134766, | |
| "reward_std": 0.25863420963287354, | |
| "rewards/accuracy_reward": 1.2630252242088318, | |
| "rewards/format_reward": 0.953125, | |
| "step": 163 | |
| }, | |
| { | |
| "completion_length": 88.734375, | |
| "epoch": 1.9759036144578315, | |
| "grad_norm": 4.842793088552531, | |
| "kl": 0.105712890625, | |
| "learning_rate": 6.706827309236947e-07, | |
| "loss": 0.0042, | |
| "reward": 2.090719521045685, | |
| "reward_std": 0.25029148161411285, | |
| "rewards/accuracy_reward": 1.1141569316387177, | |
| "rewards/format_reward": 0.9765625, | |
| "step": 164 | |
| }, | |
| { | |
| "completion_length": 86.1953125, | |
| "epoch": 1.9879518072289155, | |
| "grad_norm": 3.657481472750154, | |
| "kl": 0.125244140625, | |
| "learning_rate": 6.686746987951807e-07, | |
| "loss": 0.005, | |
| "reward": 2.2765581607818604, | |
| "reward_std": 0.2915503680706024, | |
| "rewards/accuracy_reward": 1.30780827999115, | |
| "rewards/format_reward": 0.96875, | |
| "step": 165 | |
| }, | |
| { | |
| "completion_length": 92.16666793823242, | |
| "epoch": 2.0, | |
| "grad_norm": 3.6057161188599776, | |
| "kl": 0.125732421875, | |
| "learning_rate": 6.666666666666666e-07, | |
| "loss": 0.0047, | |
| "reward": 2.234604835510254, | |
| "reward_std": 0.2570358142256737, | |
| "rewards/accuracy_reward": 1.2346049845218658, | |
| "rewards/format_reward": 1.0, | |
| "step": 166 | |
| }, | |
| { | |
| "completion_length": 87.1484375, | |
| "epoch": 2.0120481927710845, | |
| "grad_norm": 3.7603470456590564, | |
| "kl": 0.094482421875, | |
| "learning_rate": 6.646586345381526e-07, | |
| "loss": 0.0038, | |
| "reward": 2.2034374475479126, | |
| "reward_std": 0.3387380540370941, | |
| "rewards/accuracy_reward": 1.2112498879432678, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 167 | |
| }, | |
| { | |
| "completion_length": 86.1953125, | |
| "epoch": 2.0240963855421685, | |
| "grad_norm": 4.4381952945033465, | |
| "kl": 0.09765625, | |
| "learning_rate": 6.626506024096386e-07, | |
| "loss": 0.0039, | |
| "reward": 2.222957730293274, | |
| "reward_std": 0.2284381240606308, | |
| "rewards/accuracy_reward": 1.238582730293274, | |
| "rewards/format_reward": 0.984375, | |
| "step": 168 | |
| }, | |
| { | |
| "completion_length": 84.3125, | |
| "epoch": 2.036144578313253, | |
| "grad_norm": 3.399081917667578, | |
| "kl": 0.0966796875, | |
| "learning_rate": 6.606425702811244e-07, | |
| "loss": 0.0039, | |
| "reward": 2.2074966430664062, | |
| "reward_std": 0.2783028930425644, | |
| "rewards/accuracy_reward": 1.2231215238571167, | |
| "rewards/format_reward": 0.984375, | |
| "step": 169 | |
| }, | |
| { | |
| "completion_length": 84.1640625, | |
| "epoch": 2.0481927710843375, | |
| "grad_norm": 3.794821230336393, | |
| "kl": 0.10400390625, | |
| "learning_rate": 6.586345381526104e-07, | |
| "loss": 0.0042, | |
| "reward": 2.2774429321289062, | |
| "reward_std": 0.18755661696195602, | |
| "rewards/accuracy_reward": 1.2774428129196167, | |
| "rewards/format_reward": 1.0, | |
| "step": 170 | |
| }, | |
| { | |
| "completion_length": 84.7421875, | |
| "epoch": 2.0602409638554215, | |
| "grad_norm": 5.41653478361753, | |
| "kl": 0.09130859375, | |
| "learning_rate": 6.566265060240963e-07, | |
| "loss": 0.0036, | |
| "reward": 2.2825827598571777, | |
| "reward_std": 0.20142250508069992, | |
| "rewards/accuracy_reward": 1.2825825810432434, | |
| "rewards/format_reward": 1.0, | |
| "step": 171 | |
| }, | |
| { | |
| "completion_length": 78.421875, | |
| "epoch": 2.072289156626506, | |
| "grad_norm": 4.831319526617051, | |
| "kl": 0.099365234375, | |
| "learning_rate": 6.546184738955824e-07, | |
| "loss": 0.004, | |
| "reward": 2.4247552156448364, | |
| "reward_std": 0.19953592866659164, | |
| "rewards/accuracy_reward": 1.4247552752494812, | |
| "rewards/format_reward": 1.0, | |
| "step": 172 | |
| }, | |
| { | |
| "completion_length": 78.359375, | |
| "epoch": 2.0843373493975905, | |
| "grad_norm": 3.8109915515963038, | |
| "kl": 0.10498046875, | |
| "learning_rate": 6.526104417670682e-07, | |
| "loss": 0.0042, | |
| "reward": 2.3325507640838623, | |
| "reward_std": 0.26026056706905365, | |
| "rewards/accuracy_reward": 1.348175823688507, | |
| "rewards/format_reward": 0.984375, | |
| "step": 173 | |
| }, | |
| { | |
| "completion_length": 79.21875, | |
| "epoch": 2.0963855421686746, | |
| "grad_norm": 4.94758596751216, | |
| "kl": 0.130615234375, | |
| "learning_rate": 6.506024096385541e-07, | |
| "loss": 0.0052, | |
| "reward": 2.3614529371261597, | |
| "reward_std": 0.23941361159086227, | |
| "rewards/accuracy_reward": 1.3614528179168701, | |
| "rewards/format_reward": 1.0, | |
| "step": 174 | |
| }, | |
| { | |
| "completion_length": 80.8984375, | |
| "epoch": 2.108433734939759, | |
| "grad_norm": 4.645980861130919, | |
| "kl": 0.12646484375, | |
| "learning_rate": 6.485943775100401e-07, | |
| "loss": 0.0051, | |
| "reward": 2.148719310760498, | |
| "reward_std": 0.2538711354136467, | |
| "rewards/accuracy_reward": 1.1487191915512085, | |
| "rewards/format_reward": 1.0, | |
| "step": 175 | |
| }, | |
| { | |
| "completion_length": 78.921875, | |
| "epoch": 2.1204819277108435, | |
| "grad_norm": 3.362542245290514, | |
| "kl": 0.090576171875, | |
| "learning_rate": 6.465863453815261e-07, | |
| "loss": 0.0036, | |
| "reward": 2.3466458320617676, | |
| "reward_std": 0.21008533239364624, | |
| "rewards/accuracy_reward": 1.346645712852478, | |
| "rewards/format_reward": 1.0, | |
| "step": 176 | |
| }, | |
| { | |
| "completion_length": 78.5546875, | |
| "epoch": 2.1325301204819276, | |
| "grad_norm": 3.6960106974538585, | |
| "kl": 0.0908203125, | |
| "learning_rate": 6.445783132530121e-07, | |
| "loss": 0.0036, | |
| "reward": 2.4223729372024536, | |
| "reward_std": 0.15239863470196724, | |
| "rewards/accuracy_reward": 1.4223730564117432, | |
| "rewards/format_reward": 1.0, | |
| "step": 177 | |
| }, | |
| { | |
| "completion_length": 76.890625, | |
| "epoch": 2.144578313253012, | |
| "grad_norm": 3.5646239400027913, | |
| "kl": 0.103515625, | |
| "learning_rate": 6.425702811244979e-07, | |
| "loss": 0.0041, | |
| "reward": 2.4388126134872437, | |
| "reward_std": 0.22842204570770264, | |
| "rewards/accuracy_reward": 1.4466250538825989, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 178 | |
| }, | |
| { | |
| "completion_length": 78.796875, | |
| "epoch": 2.1566265060240966, | |
| "grad_norm": 3.531186908359453, | |
| "kl": 0.099609375, | |
| "learning_rate": 6.405622489959839e-07, | |
| "loss": 0.004, | |
| "reward": 2.1039586067199707, | |
| "reward_std": 0.23404612392187119, | |
| "rewards/accuracy_reward": 1.1273961663246155, | |
| "rewards/format_reward": 0.9765625, | |
| "step": 179 | |
| }, | |
| { | |
| "completion_length": 75.75, | |
| "epoch": 2.1686746987951806, | |
| "grad_norm": 5.0096541073452485, | |
| "kl": 0.1015625, | |
| "learning_rate": 6.385542168674698e-07, | |
| "loss": 0.0041, | |
| "reward": 2.374882221221924, | |
| "reward_std": 0.2003496214747429, | |
| "rewards/accuracy_reward": 1.374882161617279, | |
| "rewards/format_reward": 1.0, | |
| "step": 180 | |
| }, | |
| { | |
| "completion_length": 79.9375, | |
| "epoch": 2.180722891566265, | |
| "grad_norm": 3.929802835585037, | |
| "kl": 0.102294921875, | |
| "learning_rate": 6.365461847389559e-07, | |
| "loss": 0.0041, | |
| "reward": 2.4310786724090576, | |
| "reward_std": 0.20660096406936646, | |
| "rewards/accuracy_reward": 1.4310787916183472, | |
| "rewards/format_reward": 1.0, | |
| "step": 181 | |
| }, | |
| { | |
| "completion_length": 80.7578125, | |
| "epoch": 2.1927710843373496, | |
| "grad_norm": 4.226674931816659, | |
| "kl": 0.09619140625, | |
| "learning_rate": 6.345381526104418e-07, | |
| "loss": 0.0038, | |
| "reward": 2.3952780961990356, | |
| "reward_std": 0.2160111963748932, | |
| "rewards/accuracy_reward": 1.3952780961990356, | |
| "rewards/format_reward": 1.0, | |
| "step": 182 | |
| }, | |
| { | |
| "completion_length": 80.484375, | |
| "epoch": 2.2048192771084336, | |
| "grad_norm": 3.463553859166022, | |
| "kl": 0.107421875, | |
| "learning_rate": 6.325301204819276e-07, | |
| "loss": 0.0043, | |
| "reward": 2.3913345336914062, | |
| "reward_std": 0.22311442345380783, | |
| "rewards/accuracy_reward": 1.3991470336914062, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 183 | |
| }, | |
| { | |
| "completion_length": 78.484375, | |
| "epoch": 2.216867469879518, | |
| "grad_norm": 3.9553841913647356, | |
| "kl": 0.08642578125, | |
| "learning_rate": 6.305220883534136e-07, | |
| "loss": 0.0035, | |
| "reward": 2.353707432746887, | |
| "reward_std": 0.2809625118970871, | |
| "rewards/accuracy_reward": 1.3615199327468872, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 184 | |
| }, | |
| { | |
| "completion_length": 86.203125, | |
| "epoch": 2.2289156626506026, | |
| "grad_norm": 6.103835532514207, | |
| "kl": 0.075439453125, | |
| "learning_rate": 6.285140562248996e-07, | |
| "loss": 0.003, | |
| "reward": 2.411812663078308, | |
| "reward_std": 0.17931858450174332, | |
| "rewards/accuracy_reward": 1.411812663078308, | |
| "rewards/format_reward": 1.0, | |
| "step": 185 | |
| }, | |
| { | |
| "completion_length": 77.515625, | |
| "epoch": 2.2409638554216866, | |
| "grad_norm": 3.91857543195832, | |
| "kl": 0.10107421875, | |
| "learning_rate": 6.265060240963856e-07, | |
| "loss": 0.004, | |
| "reward": 2.2299575805664062, | |
| "reward_std": 0.2100789025425911, | |
| "rewards/accuracy_reward": 1.2377700209617615, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 186 | |
| }, | |
| { | |
| "completion_length": 77.09375, | |
| "epoch": 2.253012048192771, | |
| "grad_norm": 3.8592654709883796, | |
| "kl": 0.095947265625, | |
| "learning_rate": 6.244979919678714e-07, | |
| "loss": 0.0038, | |
| "reward": 2.47510826587677, | |
| "reward_std": 0.2556135207414627, | |
| "rewards/accuracy_reward": 1.4829206466674805, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 187 | |
| }, | |
| { | |
| "completion_length": 79.2890625, | |
| "epoch": 2.2650602409638556, | |
| "grad_norm": 6.921774157099546, | |
| "kl": 0.093017578125, | |
| "learning_rate": 6.224899598393574e-07, | |
| "loss": 0.0037, | |
| "reward": 2.3394941091537476, | |
| "reward_std": 0.23163118958473206, | |
| "rewards/accuracy_reward": 1.3394939303398132, | |
| "rewards/format_reward": 1.0, | |
| "step": 188 | |
| }, | |
| { | |
| "completion_length": 79.546875, | |
| "epoch": 2.2771084337349397, | |
| "grad_norm": 5.699992937395376, | |
| "kl": 0.08544921875, | |
| "learning_rate": 6.204819277108434e-07, | |
| "loss": 0.0034, | |
| "reward": 2.330021381378174, | |
| "reward_std": 0.21045994758605957, | |
| "rewards/accuracy_reward": 1.3300212621688843, | |
| "rewards/format_reward": 1.0, | |
| "step": 189 | |
| }, | |
| { | |
| "completion_length": 77.421875, | |
| "epoch": 2.289156626506024, | |
| "grad_norm": 4.425700742489554, | |
| "kl": 0.098388671875, | |
| "learning_rate": 6.184738955823293e-07, | |
| "loss": 0.0039, | |
| "reward": 2.2294440269470215, | |
| "reward_std": 0.21671444922685623, | |
| "rewards/accuracy_reward": 1.2294440865516663, | |
| "rewards/format_reward": 1.0, | |
| "step": 190 | |
| }, | |
| { | |
| "completion_length": 74.6640625, | |
| "epoch": 2.3012048192771086, | |
| "grad_norm": 3.5141288907091783, | |
| "kl": 0.08154296875, | |
| "learning_rate": 6.164658634538153e-07, | |
| "loss": 0.0033, | |
| "reward": 2.417364239692688, | |
| "reward_std": 0.18784678727388382, | |
| "rewards/accuracy_reward": 1.4173641800880432, | |
| "rewards/format_reward": 1.0, | |
| "step": 191 | |
| }, | |
| { | |
| "completion_length": 74.53125, | |
| "epoch": 2.3132530120481927, | |
| "grad_norm": 4.6610918738389095, | |
| "kl": 0.096435546875, | |
| "learning_rate": 6.144578313253011e-07, | |
| "loss": 0.0039, | |
| "reward": 2.4048426151275635, | |
| "reward_std": 0.2764005810022354, | |
| "rewards/accuracy_reward": 1.412655234336853, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 192 | |
| }, | |
| { | |
| "completion_length": 80.8984375, | |
| "epoch": 2.325301204819277, | |
| "grad_norm": 6.933183617809393, | |
| "kl": 0.07861328125, | |
| "learning_rate": 6.124497991967871e-07, | |
| "loss": 0.0031, | |
| "reward": 2.2180745601654053, | |
| "reward_std": 0.2127843052148819, | |
| "rewards/accuracy_reward": 1.21807461977005, | |
| "rewards/format_reward": 1.0, | |
| "step": 193 | |
| }, | |
| { | |
| "completion_length": 80.9296875, | |
| "epoch": 2.337349397590361, | |
| "grad_norm": 4.526116466506062, | |
| "kl": 0.088623046875, | |
| "learning_rate": 6.104417670682731e-07, | |
| "loss": 0.0035, | |
| "reward": 2.2327487468719482, | |
| "reward_std": 0.2369586005806923, | |
| "rewards/accuracy_reward": 1.240561306476593, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 194 | |
| }, | |
| { | |
| "completion_length": 79.8359375, | |
| "epoch": 2.3493975903614457, | |
| "grad_norm": 3.410370565415923, | |
| "kl": 0.09326171875, | |
| "learning_rate": 6.084337349397591e-07, | |
| "loss": 0.0037, | |
| "reward": 2.222264051437378, | |
| "reward_std": 0.26303592324256897, | |
| "rewards/accuracy_reward": 1.230076551437378, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 195 | |
| }, | |
| { | |
| "completion_length": 73.8828125, | |
| "epoch": 2.36144578313253, | |
| "grad_norm": 3.962197046428477, | |
| "kl": 0.103271484375, | |
| "learning_rate": 6.064257028112449e-07, | |
| "loss": 0.0041, | |
| "reward": 2.296523690223694, | |
| "reward_std": 0.370675727725029, | |
| "rewards/accuracy_reward": 1.2965235710144043, | |
| "rewards/format_reward": 1.0, | |
| "step": 196 | |
| }, | |
| { | |
| "completion_length": 74.515625, | |
| "epoch": 2.3734939759036147, | |
| "grad_norm": 3.7849181083166066, | |
| "kl": 0.100341796875, | |
| "learning_rate": 6.044176706827308e-07, | |
| "loss": 0.004, | |
| "reward": 2.1898573637008667, | |
| "reward_std": 0.2903239354491234, | |
| "rewards/accuracy_reward": 1.1898574829101562, | |
| "rewards/format_reward": 1.0, | |
| "step": 197 | |
| }, | |
| { | |
| "completion_length": 71.015625, | |
| "epoch": 2.3855421686746987, | |
| "grad_norm": 4.598411590922377, | |
| "kl": 0.09716796875, | |
| "learning_rate": 6.024096385542169e-07, | |
| "loss": 0.0039, | |
| "reward": 2.3405251502990723, | |
| "reward_std": 0.1668776124715805, | |
| "rewards/accuracy_reward": 1.3405250310897827, | |
| "rewards/format_reward": 1.0, | |
| "step": 198 | |
| }, | |
| { | |
| "completion_length": 72.0234375, | |
| "epoch": 2.397590361445783, | |
| "grad_norm": 4.094960420612339, | |
| "kl": 0.08447265625, | |
| "learning_rate": 6.004016064257028e-07, | |
| "loss": 0.0034, | |
| "reward": 2.2692129611968994, | |
| "reward_std": 0.22979120910167694, | |
| "rewards/accuracy_reward": 1.2848379015922546, | |
| "rewards/format_reward": 0.984375, | |
| "step": 199 | |
| }, | |
| { | |
| "completion_length": 76.34375, | |
| "epoch": 2.4096385542168672, | |
| "grad_norm": 5.228591551586785, | |
| "kl": 0.0771484375, | |
| "learning_rate": 5.983935742971888e-07, | |
| "loss": 0.0031, | |
| "reward": 2.29106342792511, | |
| "reward_std": 0.22756240516901016, | |
| "rewards/accuracy_reward": 1.2910634279251099, | |
| "rewards/format_reward": 1.0, | |
| "step": 200 | |
| }, | |
| { | |
| "completion_length": 79.3828125, | |
| "epoch": 2.4216867469879517, | |
| "grad_norm": 3.532651567007306, | |
| "kl": 0.140869140625, | |
| "learning_rate": 5.963855421686746e-07, | |
| "loss": 0.0056, | |
| "reward": 2.218053698539734, | |
| "reward_std": 0.24822543561458588, | |
| "rewards/accuracy_reward": 1.2180536985397339, | |
| "rewards/format_reward": 1.0, | |
| "step": 201 | |
| }, | |
| { | |
| "completion_length": 76.0, | |
| "epoch": 2.433734939759036, | |
| "grad_norm": 3.316768093202225, | |
| "kl": 0.088134765625, | |
| "learning_rate": 5.943775100401606e-07, | |
| "loss": 0.0035, | |
| "reward": 2.26613187789917, | |
| "reward_std": 0.24750088155269623, | |
| "rewards/accuracy_reward": 1.2739443182945251, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 202 | |
| }, | |
| { | |
| "completion_length": 70.5234375, | |
| "epoch": 2.4457831325301207, | |
| "grad_norm": 9.031966519770473, | |
| "kl": 0.099853515625, | |
| "learning_rate": 5.923694779116466e-07, | |
| "loss": 0.004, | |
| "reward": 2.317081928253174, | |
| "reward_std": 0.24299181252717972, | |
| "rewards/accuracy_reward": 1.3248944282531738, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 203 | |
| }, | |
| { | |
| "completion_length": 72.1484375, | |
| "epoch": 2.4578313253012047, | |
| "grad_norm": 4.923799185057533, | |
| "kl": 0.09716796875, | |
| "learning_rate": 5.903614457831325e-07, | |
| "loss": 0.0039, | |
| "reward": 2.202351689338684, | |
| "reward_std": 0.24287213385105133, | |
| "rewards/accuracy_reward": 1.2023517489433289, | |
| "rewards/format_reward": 1.0, | |
| "step": 204 | |
| }, | |
| { | |
| "completion_length": 75.5390625, | |
| "epoch": 2.4698795180722892, | |
| "grad_norm": 10.424209527328602, | |
| "kl": 0.0849609375, | |
| "learning_rate": 5.883534136546184e-07, | |
| "loss": 0.0034, | |
| "reward": 2.3431246280670166, | |
| "reward_std": 0.21441341936588287, | |
| "rewards/accuracy_reward": 1.3431245684623718, | |
| "rewards/format_reward": 1.0, | |
| "step": 205 | |
| }, | |
| { | |
| "completion_length": 74.1328125, | |
| "epoch": 2.4819277108433733, | |
| "grad_norm": 5.39794558294026, | |
| "kl": 0.08349609375, | |
| "learning_rate": 5.863453815261043e-07, | |
| "loss": 0.0033, | |
| "reward": 2.318004846572876, | |
| "reward_std": 0.1649407297372818, | |
| "rewards/accuracy_reward": 1.3180049657821655, | |
| "rewards/format_reward": 1.0, | |
| "step": 206 | |
| }, | |
| { | |
| "completion_length": 70.828125, | |
| "epoch": 2.4939759036144578, | |
| "grad_norm": 5.651509118393077, | |
| "kl": 0.099609375, | |
| "learning_rate": 5.843373493975904e-07, | |
| "loss": 0.004, | |
| "reward": 2.2745083570480347, | |
| "reward_std": 0.1795399785041809, | |
| "rewards/accuracy_reward": 1.27450829744339, | |
| "rewards/format_reward": 1.0, | |
| "step": 207 | |
| }, | |
| { | |
| "completion_length": 75.1484375, | |
| "epoch": 2.5060240963855422, | |
| "grad_norm": 3.374258945078158, | |
| "kl": 0.099853515625, | |
| "learning_rate": 5.823293172690763e-07, | |
| "loss": 0.004, | |
| "reward": 2.183190941810608, | |
| "reward_std": 0.19665208458900452, | |
| "rewards/accuracy_reward": 1.183190941810608, | |
| "rewards/format_reward": 1.0, | |
| "step": 208 | |
| }, | |
| { | |
| "completion_length": 75.15625, | |
| "epoch": 2.5180722891566267, | |
| "grad_norm": 3.680961209255419, | |
| "kl": 0.085693359375, | |
| "learning_rate": 5.803212851405623e-07, | |
| "loss": 0.0034, | |
| "reward": 2.3783202171325684, | |
| "reward_std": 0.21517369151115417, | |
| "rewards/accuracy_reward": 1.3861328959465027, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 209 | |
| }, | |
| { | |
| "completion_length": 75.890625, | |
| "epoch": 2.5301204819277108, | |
| "grad_norm": 4.203577590596214, | |
| "kl": 0.093017578125, | |
| "learning_rate": 5.783132530120481e-07, | |
| "loss": 0.0037, | |
| "reward": 2.232303738594055, | |
| "reward_std": 0.21822457760572433, | |
| "rewards/accuracy_reward": 1.2401162385940552, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 210 | |
| }, | |
| { | |
| "completion_length": 72.5234375, | |
| "epoch": 2.5421686746987953, | |
| "grad_norm": 5.049709537985753, | |
| "kl": 0.09033203125, | |
| "learning_rate": 5.76305220883534e-07, | |
| "loss": 0.0036, | |
| "reward": 2.3138071298599243, | |
| "reward_std": 0.18903522193431854, | |
| "rewards/accuracy_reward": 1.3138071298599243, | |
| "rewards/format_reward": 1.0, | |
| "step": 211 | |
| }, | |
| { | |
| "completion_length": 77.6796875, | |
| "epoch": 2.5542168674698793, | |
| "grad_norm": 4.79270453347689, | |
| "kl": 0.10791015625, | |
| "learning_rate": 5.742971887550201e-07, | |
| "loss": 0.0043, | |
| "reward": 2.35454523563385, | |
| "reward_std": 0.260717436671257, | |
| "rewards/accuracy_reward": 1.36235773563385, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 212 | |
| }, | |
| { | |
| "completion_length": 75.5234375, | |
| "epoch": 2.566265060240964, | |
| "grad_norm": 3.8110594359613694, | |
| "kl": 0.132080078125, | |
| "learning_rate": 5.72289156626506e-07, | |
| "loss": 0.0053, | |
| "reward": 2.3396618366241455, | |
| "reward_std": 0.2776957154273987, | |
| "rewards/accuracy_reward": 1.3474743366241455, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 213 | |
| }, | |
| { | |
| "completion_length": 78.8203125, | |
| "epoch": 2.5783132530120483, | |
| "grad_norm": 3.5277793226603467, | |
| "kl": 0.082763671875, | |
| "learning_rate": 5.70281124497992e-07, | |
| "loss": 0.0033, | |
| "reward": 2.282657027244568, | |
| "reward_std": 0.20082392543554306, | |
| "rewards/accuracy_reward": 1.2826570868492126, | |
| "rewards/format_reward": 1.0, | |
| "step": 214 | |
| }, | |
| { | |
| "completion_length": 79.7265625, | |
| "epoch": 2.5903614457831328, | |
| "grad_norm": 5.661825173466666, | |
| "kl": 0.070068359375, | |
| "learning_rate": 5.682730923694778e-07, | |
| "loss": 0.0028, | |
| "reward": 2.2916386127471924, | |
| "reward_std": 0.22843700647354126, | |
| "rewards/accuracy_reward": 1.2916386723518372, | |
| "rewards/format_reward": 1.0, | |
| "step": 215 | |
| }, | |
| { | |
| "completion_length": 75.484375, | |
| "epoch": 2.602409638554217, | |
| "grad_norm": 5.408656767411551, | |
| "kl": 0.074951171875, | |
| "learning_rate": 5.662650602409639e-07, | |
| "loss": 0.003, | |
| "reward": 2.4862678050994873, | |
| "reward_std": 0.17430586367845535, | |
| "rewards/accuracy_reward": 1.4862679243087769, | |
| "rewards/format_reward": 1.0, | |
| "step": 216 | |
| }, | |
| { | |
| "completion_length": 75.4140625, | |
| "epoch": 2.6144578313253013, | |
| "grad_norm": 4.437169209890788, | |
| "kl": 0.1123046875, | |
| "learning_rate": 5.642570281124498e-07, | |
| "loss": 0.0045, | |
| "reward": 2.2881970405578613, | |
| "reward_std": 0.24159938842058182, | |
| "rewards/accuracy_reward": 1.3116344809532166, | |
| "rewards/format_reward": 0.9765625, | |
| "step": 217 | |
| }, | |
| { | |
| "completion_length": 77.1484375, | |
| "epoch": 2.6265060240963853, | |
| "grad_norm": 3.7017405154535608, | |
| "kl": 0.0849609375, | |
| "learning_rate": 5.622489959839358e-07, | |
| "loss": 0.0034, | |
| "reward": 2.42057728767395, | |
| "reward_std": 0.1918034851551056, | |
| "rewards/accuracy_reward": 1.4205771684646606, | |
| "rewards/format_reward": 1.0, | |
| "step": 218 | |
| }, | |
| { | |
| "completion_length": 74.9921875, | |
| "epoch": 2.63855421686747, | |
| "grad_norm": 3.0572748613034184, | |
| "kl": 0.08056640625, | |
| "learning_rate": 5.602409638554216e-07, | |
| "loss": 0.0032, | |
| "reward": 2.296902298927307, | |
| "reward_std": 0.22776726633310318, | |
| "rewards/accuracy_reward": 1.2969022989273071, | |
| "rewards/format_reward": 1.0, | |
| "step": 219 | |
| }, | |
| { | |
| "completion_length": 77.9375, | |
| "epoch": 2.6506024096385543, | |
| "grad_norm": 5.142063259050984, | |
| "kl": 0.08251953125, | |
| "learning_rate": 5.582329317269075e-07, | |
| "loss": 0.0033, | |
| "reward": 2.411815643310547, | |
| "reward_std": 0.20656804740428925, | |
| "rewards/accuracy_reward": 1.4118155241012573, | |
| "rewards/format_reward": 1.0, | |
| "step": 220 | |
| }, | |
| { | |
| "completion_length": 75.0625, | |
| "epoch": 2.662650602409639, | |
| "grad_norm": 9.244315362233946, | |
| "kl": 0.094482421875, | |
| "learning_rate": 5.562248995983936e-07, | |
| "loss": 0.0038, | |
| "reward": 2.2525359392166138, | |
| "reward_std": 0.23683273047208786, | |
| "rewards/accuracy_reward": 1.2681609392166138, | |
| "rewards/format_reward": 0.984375, | |
| "step": 221 | |
| }, | |
| { | |
| "completion_length": 78.390625, | |
| "epoch": 2.674698795180723, | |
| "grad_norm": 4.89406748105177, | |
| "kl": 0.078125, | |
| "learning_rate": 5.542168674698795e-07, | |
| "loss": 0.0031, | |
| "reward": 2.33753764629364, | |
| "reward_std": 0.21247170120477676, | |
| "rewards/accuracy_reward": 1.3453501462936401, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 222 | |
| }, | |
| { | |
| "completion_length": 73.0859375, | |
| "epoch": 2.6867469879518073, | |
| "grad_norm": 3.6393688137680464, | |
| "kl": 0.0810546875, | |
| "learning_rate": 5.522088353413655e-07, | |
| "loss": 0.0032, | |
| "reward": 2.2808330059051514, | |
| "reward_std": 0.1841505616903305, | |
| "rewards/accuracy_reward": 1.280833125114441, | |
| "rewards/format_reward": 1.0, | |
| "step": 223 | |
| }, | |
| { | |
| "completion_length": 77.1484375, | |
| "epoch": 2.6987951807228914, | |
| "grad_norm": 2.9614100491209516, | |
| "kl": 0.08447265625, | |
| "learning_rate": 5.502008032128513e-07, | |
| "loss": 0.0034, | |
| "reward": 2.256025791168213, | |
| "reward_std": 0.22689195722341537, | |
| "rewards/accuracy_reward": 1.271650791168213, | |
| "rewards/format_reward": 0.984375, | |
| "step": 224 | |
| }, | |
| { | |
| "completion_length": 72.6015625, | |
| "epoch": 2.710843373493976, | |
| "grad_norm": 4.624802749562738, | |
| "kl": 0.0810546875, | |
| "learning_rate": 5.481927710843374e-07, | |
| "loss": 0.0032, | |
| "reward": 2.367666721343994, | |
| "reward_std": 0.20605457574129105, | |
| "rewards/accuracy_reward": 1.367666482925415, | |
| "rewards/format_reward": 1.0, | |
| "step": 225 | |
| }, | |
| { | |
| "completion_length": 70.859375, | |
| "epoch": 2.7228915662650603, | |
| "grad_norm": 6.0943428059060505, | |
| "kl": 0.10205078125, | |
| "learning_rate": 5.461847389558233e-07, | |
| "loss": 0.0041, | |
| "reward": 2.3246583938598633, | |
| "reward_std": 0.17254704982042313, | |
| "rewards/accuracy_reward": 1.3324708938598633, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 226 | |
| }, | |
| { | |
| "completion_length": 75.640625, | |
| "epoch": 2.734939759036145, | |
| "grad_norm": 4.26546660385252, | |
| "kl": 0.090087890625, | |
| "learning_rate": 5.441767068273092e-07, | |
| "loss": 0.0036, | |
| "reward": 2.307809591293335, | |
| "reward_std": 0.2002812698483467, | |
| "rewards/accuracy_reward": 1.315622091293335, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 227 | |
| }, | |
| { | |
| "completion_length": 73.671875, | |
| "epoch": 2.746987951807229, | |
| "grad_norm": 3.4690497244218435, | |
| "kl": 0.0927734375, | |
| "learning_rate": 5.421686746987951e-07, | |
| "loss": 0.0037, | |
| "reward": 2.4064533710479736, | |
| "reward_std": 0.1763758659362793, | |
| "rewards/accuracy_reward": 1.4142658710479736, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 228 | |
| }, | |
| { | |
| "completion_length": 77.265625, | |
| "epoch": 2.7590361445783134, | |
| "grad_norm": 3.8015660942675313, | |
| "kl": 0.107666015625, | |
| "learning_rate": 5.401606425702811e-07, | |
| "loss": 0.0043, | |
| "reward": 2.417749524116516, | |
| "reward_std": 0.20080577582120895, | |
| "rewards/accuracy_reward": 1.4333745837211609, | |
| "rewards/format_reward": 0.984375, | |
| "step": 229 | |
| }, | |
| { | |
| "completion_length": 78.6484375, | |
| "epoch": 2.7710843373493974, | |
| "grad_norm": 4.593078230781537, | |
| "kl": 0.081298828125, | |
| "learning_rate": 5.381526104417671e-07, | |
| "loss": 0.0032, | |
| "reward": 2.310904383659363, | |
| "reward_std": 0.20601534098386765, | |
| "rewards/accuracy_reward": 1.326529324054718, | |
| "rewards/format_reward": 0.984375, | |
| "step": 230 | |
| }, | |
| { | |
| "completion_length": 69.75, | |
| "epoch": 2.783132530120482, | |
| "grad_norm": 4.781119598148597, | |
| "kl": 0.092041015625, | |
| "learning_rate": 5.36144578313253e-07, | |
| "loss": 0.0037, | |
| "reward": 2.4060455560684204, | |
| "reward_std": 0.1945626586675644, | |
| "rewards/accuracy_reward": 1.41385817527771, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 231 | |
| }, | |
| { | |
| "completion_length": 72.125, | |
| "epoch": 2.7951807228915664, | |
| "grad_norm": 3.6431689651666925, | |
| "kl": 0.084716796875, | |
| "learning_rate": 5.34136546184739e-07, | |
| "loss": 0.0034, | |
| "reward": 2.2687569856643677, | |
| "reward_std": 0.20781449228525162, | |
| "rewards/accuracy_reward": 1.2765693664550781, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 232 | |
| }, | |
| { | |
| "completion_length": 75.28125, | |
| "epoch": 2.807228915662651, | |
| "grad_norm": 3.463525581618983, | |
| "kl": 0.0830078125, | |
| "learning_rate": 5.321285140562248e-07, | |
| "loss": 0.0033, | |
| "reward": 2.2786985635757446, | |
| "reward_std": 0.1869373545050621, | |
| "rewards/accuracy_reward": 1.2865110039710999, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 233 | |
| }, | |
| { | |
| "completion_length": 72.390625, | |
| "epoch": 2.819277108433735, | |
| "grad_norm": 3.989550051539227, | |
| "kl": 0.08935546875, | |
| "learning_rate": 5.301204819277109e-07, | |
| "loss": 0.0036, | |
| "reward": 2.2122349739074707, | |
| "reward_std": 0.17366793006658554, | |
| "rewards/accuracy_reward": 1.212234914302826, | |
| "rewards/format_reward": 1.0, | |
| "step": 234 | |
| }, | |
| { | |
| "completion_length": 68.4296875, | |
| "epoch": 2.8313253012048194, | |
| "grad_norm": 5.293732432179004, | |
| "kl": 0.1162109375, | |
| "learning_rate": 5.281124497991968e-07, | |
| "loss": 0.0046, | |
| "reward": 2.273004412651062, | |
| "reward_std": 0.21551835536956787, | |
| "rewards/accuracy_reward": 1.2730044722557068, | |
| "rewards/format_reward": 1.0, | |
| "step": 235 | |
| }, | |
| { | |
| "completion_length": 70.4765625, | |
| "epoch": 2.8433734939759034, | |
| "grad_norm": 3.483964465031993, | |
| "kl": 0.08642578125, | |
| "learning_rate": 5.261044176706827e-07, | |
| "loss": 0.0035, | |
| "reward": 2.5097464323043823, | |
| "reward_std": 0.21660751849412918, | |
| "rewards/accuracy_reward": 1.509746491909027, | |
| "rewards/format_reward": 1.0, | |
| "step": 236 | |
| }, | |
| { | |
| "completion_length": 67.1796875, | |
| "epoch": 2.855421686746988, | |
| "grad_norm": 3.2613871176315286, | |
| "kl": 0.109619140625, | |
| "learning_rate": 5.240963855421686e-07, | |
| "loss": 0.0044, | |
| "reward": 2.2154468297958374, | |
| "reward_std": 0.2426525428891182, | |
| "rewards/accuracy_reward": 1.2154468894004822, | |
| "rewards/format_reward": 1.0, | |
| "step": 237 | |
| }, | |
| { | |
| "completion_length": 73.875, | |
| "epoch": 2.8674698795180724, | |
| "grad_norm": 5.04569953866162, | |
| "kl": 0.105224609375, | |
| "learning_rate": 5.220883534136546e-07, | |
| "loss": 0.0042, | |
| "reward": 2.3947439193725586, | |
| "reward_std": 0.16551193594932556, | |
| "rewards/accuracy_reward": 1.3947439193725586, | |
| "rewards/format_reward": 1.0, | |
| "step": 238 | |
| }, | |
| { | |
| "completion_length": 70.03125, | |
| "epoch": 2.8795180722891565, | |
| "grad_norm": 3.2080049289623997, | |
| "kl": 0.10986328125, | |
| "learning_rate": 5.200803212851406e-07, | |
| "loss": 0.0044, | |
| "reward": 2.394848346710205, | |
| "reward_std": 0.22504138201475143, | |
| "rewards/accuracy_reward": 1.394848346710205, | |
| "rewards/format_reward": 1.0, | |
| "step": 239 | |
| }, | |
| { | |
| "completion_length": 70.90625, | |
| "epoch": 2.891566265060241, | |
| "grad_norm": 3.843192487462901, | |
| "kl": 0.1171875, | |
| "learning_rate": 5.180722891566265e-07, | |
| "loss": 0.0047, | |
| "reward": 2.2219191789627075, | |
| "reward_std": 0.2526251822710037, | |
| "rewards/accuracy_reward": 1.2219191193580627, | |
| "rewards/format_reward": 1.0, | |
| "step": 240 | |
| }, | |
| { | |
| "completion_length": 67.1328125, | |
| "epoch": 2.9036144578313254, | |
| "grad_norm": 3.0217979987505394, | |
| "kl": 0.104248046875, | |
| "learning_rate": 5.160642570281125e-07, | |
| "loss": 0.0042, | |
| "reward": 2.2357059717178345, | |
| "reward_std": 0.181558758020401, | |
| "rewards/accuracy_reward": 1.235705852508545, | |
| "rewards/format_reward": 1.0, | |
| "step": 241 | |
| }, | |
| { | |
| "completion_length": 67.0390625, | |
| "epoch": 2.9156626506024095, | |
| "grad_norm": 4.171949473201647, | |
| "kl": 0.1044921875, | |
| "learning_rate": 5.140562248995983e-07, | |
| "loss": 0.0042, | |
| "reward": 2.3148874044418335, | |
| "reward_std": 0.17748098075389862, | |
| "rewards/accuracy_reward": 1.3148874640464783, | |
| "rewards/format_reward": 1.0, | |
| "step": 242 | |
| }, | |
| { | |
| "completion_length": 65.8671875, | |
| "epoch": 2.927710843373494, | |
| "grad_norm": 8.908769866071971, | |
| "kl": 0.11181640625, | |
| "learning_rate": 5.120481927710843e-07, | |
| "loss": 0.0045, | |
| "reward": 2.2218422889709473, | |
| "reward_std": 0.1961566060781479, | |
| "rewards/accuracy_reward": 1.2296549081802368, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 243 | |
| }, | |
| { | |
| "completion_length": 63.6953125, | |
| "epoch": 2.9397590361445785, | |
| "grad_norm": 12.929344924116855, | |
| "kl": 0.106201171875, | |
| "learning_rate": 5.100401606425703e-07, | |
| "loss": 0.0042, | |
| "reward": 2.4831990003585815, | |
| "reward_std": 0.17936265468597412, | |
| "rewards/accuracy_reward": 1.4831989407539368, | |
| "rewards/format_reward": 1.0, | |
| "step": 244 | |
| }, | |
| { | |
| "completion_length": 62.28125, | |
| "epoch": 2.9518072289156625, | |
| "grad_norm": 3.4705083145900404, | |
| "kl": 0.111328125, | |
| "learning_rate": 5.080321285140562e-07, | |
| "loss": 0.0044, | |
| "reward": 2.352734327316284, | |
| "reward_std": 0.2174607664346695, | |
| "rewards/accuracy_reward": 1.3683592081069946, | |
| "rewards/format_reward": 0.984375, | |
| "step": 245 | |
| }, | |
| { | |
| "completion_length": 69.640625, | |
| "epoch": 2.963855421686747, | |
| "grad_norm": 4.178352503452598, | |
| "kl": 0.111572265625, | |
| "learning_rate": 5.060240963855421e-07, | |
| "loss": 0.0045, | |
| "reward": 2.3825145959854126, | |
| "reward_std": 0.21491926908493042, | |
| "rewards/accuracy_reward": 1.3903270959854126, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 246 | |
| }, | |
| { | |
| "completion_length": 65.875, | |
| "epoch": 2.9759036144578315, | |
| "grad_norm": 4.426857679190133, | |
| "kl": 0.149169921875, | |
| "learning_rate": 5.040160642570281e-07, | |
| "loss": 0.006, | |
| "reward": 2.1721856594085693, | |
| "reward_std": 0.2390434294939041, | |
| "rewards/accuracy_reward": 1.1721857190132141, | |
| "rewards/format_reward": 1.0, | |
| "step": 247 | |
| }, | |
| { | |
| "completion_length": 70.9921875, | |
| "epoch": 2.9879518072289155, | |
| "grad_norm": 4.720913912936636, | |
| "kl": 0.114013671875, | |
| "learning_rate": 5.020080321285141e-07, | |
| "loss": 0.0046, | |
| "reward": 2.2051347494125366, | |
| "reward_std": 0.2722553163766861, | |
| "rewards/accuracy_reward": 1.2285721898078918, | |
| "rewards/format_reward": 0.9765625, | |
| "step": 248 | |
| }, | |
| { | |
| "completion_length": 64.25000190734863, | |
| "epoch": 3.0, | |
| "grad_norm": 3.5181266600609904, | |
| "kl": 0.11962890625, | |
| "learning_rate": 5e-07, | |
| "loss": 0.0048, | |
| "reward": 2.1161320209503174, | |
| "reward_std": 0.430472195148468, | |
| "rewards/accuracy_reward": 1.1994653940200806, | |
| "rewards/format_reward": 0.9166666865348816, | |
| "step": 249 | |
| }, | |
| { | |
| "completion_length": 68.1875, | |
| "epoch": 3.0120481927710845, | |
| "grad_norm": 3.5431810235066643, | |
| "kl": 0.09619140625, | |
| "learning_rate": 4.979919678714859e-07, | |
| "loss": 0.0038, | |
| "reward": 2.323817491531372, | |
| "reward_std": 0.23299024999141693, | |
| "rewards/accuracy_reward": 1.3316298723220825, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 250 | |
| }, | |
| { | |
| "completion_length": 71.6953125, | |
| "epoch": 3.0240963855421685, | |
| "grad_norm": 3.3542739826451173, | |
| "kl": 0.08642578125, | |
| "learning_rate": 4.959839357429718e-07, | |
| "loss": 0.0035, | |
| "reward": 2.411439895629883, | |
| "reward_std": 0.19917739927768707, | |
| "rewards/accuracy_reward": 1.4114398956298828, | |
| "rewards/format_reward": 1.0, | |
| "step": 251 | |
| }, | |
| { | |
| "completion_length": 68.109375, | |
| "epoch": 3.036144578313253, | |
| "grad_norm": 12.151823073672764, | |
| "kl": 0.110107421875, | |
| "learning_rate": 4.939759036144578e-07, | |
| "loss": 0.0044, | |
| "reward": 2.5318474769592285, | |
| "reward_std": 0.18056734651327133, | |
| "rewards/accuracy_reward": 1.5396599173545837, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 252 | |
| }, | |
| { | |
| "completion_length": 72.578125, | |
| "epoch": 3.0481927710843375, | |
| "grad_norm": 3.219943316402962, | |
| "kl": 0.099853515625, | |
| "learning_rate": 4.919678714859438e-07, | |
| "loss": 0.004, | |
| "reward": 2.3200578689575195, | |
| "reward_std": 0.15618911385536194, | |
| "rewards/accuracy_reward": 1.3200578689575195, | |
| "rewards/format_reward": 1.0, | |
| "step": 253 | |
| }, | |
| { | |
| "completion_length": 61.3828125, | |
| "epoch": 3.0602409638554215, | |
| "grad_norm": 3.865556225897638, | |
| "kl": 0.10888671875, | |
| "learning_rate": 4.899598393574297e-07, | |
| "loss": 0.0044, | |
| "reward": 2.209138035774231, | |
| "reward_std": 0.17473262548446655, | |
| "rewards/accuracy_reward": 1.2091379761695862, | |
| "rewards/format_reward": 1.0, | |
| "step": 254 | |
| }, | |
| { | |
| "completion_length": 66.7421875, | |
| "epoch": 3.072289156626506, | |
| "grad_norm": 4.017362101946035, | |
| "kl": 0.1259765625, | |
| "learning_rate": 4.879518072289156e-07, | |
| "loss": 0.005, | |
| "reward": 2.139701724052429, | |
| "reward_std": 0.22376088798046112, | |
| "rewards/accuracy_reward": 1.1397016048431396, | |
| "rewards/format_reward": 1.0, | |
| "step": 255 | |
| }, | |
| { | |
| "completion_length": 62.71875, | |
| "epoch": 3.0843373493975905, | |
| "grad_norm": 3.4288754746391947, | |
| "kl": 0.140625, | |
| "learning_rate": 4.859437751004016e-07, | |
| "loss": 0.0056, | |
| "reward": 2.2105259895324707, | |
| "reward_std": 0.22984497249126434, | |
| "rewards/accuracy_reward": 1.2261508703231812, | |
| "rewards/format_reward": 0.984375, | |
| "step": 256 | |
| }, | |
| { | |
| "completion_length": 66.6953125, | |
| "epoch": 3.0963855421686746, | |
| "grad_norm": 3.481985490355864, | |
| "kl": 0.1181640625, | |
| "learning_rate": 4.839357429718875e-07, | |
| "loss": 0.0047, | |
| "reward": 2.5049203634262085, | |
| "reward_std": 0.1857297122478485, | |
| "rewards/accuracy_reward": 1.5049203634262085, | |
| "rewards/format_reward": 1.0, | |
| "step": 257 | |
| }, | |
| { | |
| "completion_length": 67.484375, | |
| "epoch": 3.108433734939759, | |
| "grad_norm": 3.6977753194922403, | |
| "kl": 0.107666015625, | |
| "learning_rate": 4.819277108433735e-07, | |
| "loss": 0.0043, | |
| "reward": 2.3002774715423584, | |
| "reward_std": 0.21863283962011337, | |
| "rewards/accuracy_reward": 1.3080899119377136, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 258 | |
| }, | |
| { | |
| "completion_length": 71.984375, | |
| "epoch": 3.1204819277108435, | |
| "grad_norm": 3.2391554999759054, | |
| "kl": 0.099853515625, | |
| "learning_rate": 4.799196787148594e-07, | |
| "loss": 0.004, | |
| "reward": 2.404132843017578, | |
| "reward_std": 0.19443362206220627, | |
| "rewards/accuracy_reward": 1.4119452238082886, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 259 | |
| }, | |
| { | |
| "completion_length": 70.3984375, | |
| "epoch": 3.1325301204819276, | |
| "grad_norm": 3.8470897735347993, | |
| "kl": 0.11181640625, | |
| "learning_rate": 4.779116465863453e-07, | |
| "loss": 0.0045, | |
| "reward": 2.2314306497573853, | |
| "reward_std": 0.1860732138156891, | |
| "rewards/accuracy_reward": 1.2392430305480957, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 260 | |
| }, | |
| { | |
| "completion_length": 71.7109375, | |
| "epoch": 3.144578313253012, | |
| "grad_norm": 5.7256880192839965, | |
| "kl": 0.101806640625, | |
| "learning_rate": 4.7590361445783126e-07, | |
| "loss": 0.0041, | |
| "reward": 2.3397083282470703, | |
| "reward_std": 0.21985551714897156, | |
| "rewards/accuracy_reward": 1.3397083282470703, | |
| "rewards/format_reward": 1.0, | |
| "step": 261 | |
| }, | |
| { | |
| "completion_length": 72.7265625, | |
| "epoch": 3.1566265060240966, | |
| "grad_norm": 4.6788843643036255, | |
| "kl": 0.183837890625, | |
| "learning_rate": 4.7389558232931724e-07, | |
| "loss": 0.0074, | |
| "reward": 2.288654088973999, | |
| "reward_std": 0.25063957273960114, | |
| "rewards/accuracy_reward": 1.296466588973999, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 262 | |
| }, | |
| { | |
| "completion_length": 66.96875, | |
| "epoch": 3.1686746987951806, | |
| "grad_norm": 4.000735227178484, | |
| "kl": 0.1171875, | |
| "learning_rate": 4.7188755020080317e-07, | |
| "loss": 0.0047, | |
| "reward": 2.385547637939453, | |
| "reward_std": 0.179743941873312, | |
| "rewards/accuracy_reward": 1.393360197544098, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 263 | |
| }, | |
| { | |
| "completion_length": 73.078125, | |
| "epoch": 3.180722891566265, | |
| "grad_norm": 3.2436175706744903, | |
| "kl": 0.08837890625, | |
| "learning_rate": 4.6987951807228915e-07, | |
| "loss": 0.0035, | |
| "reward": 2.3714927434921265, | |
| "reward_std": 0.1866167113184929, | |
| "rewards/accuracy_reward": 1.3793052434921265, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 264 | |
| }, | |
| { | |
| "completion_length": 67.7578125, | |
| "epoch": 3.1927710843373496, | |
| "grad_norm": 4.16773338040152, | |
| "kl": 0.09619140625, | |
| "learning_rate": 4.678714859437751e-07, | |
| "loss": 0.0038, | |
| "reward": 2.256360650062561, | |
| "reward_std": 0.2188187688589096, | |
| "rewards/accuracy_reward": 1.256360650062561, | |
| "rewards/format_reward": 1.0, | |
| "step": 265 | |
| }, | |
| { | |
| "completion_length": 71.6796875, | |
| "epoch": 3.2048192771084336, | |
| "grad_norm": 3.7554898641141388, | |
| "kl": 0.094482421875, | |
| "learning_rate": 4.6586345381526106e-07, | |
| "loss": 0.0038, | |
| "reward": 2.285356283187866, | |
| "reward_std": 0.2733229324221611, | |
| "rewards/accuracy_reward": 1.2853562831878662, | |
| "rewards/format_reward": 1.0, | |
| "step": 266 | |
| }, | |
| { | |
| "completion_length": 69.53125, | |
| "epoch": 3.216867469879518, | |
| "grad_norm": 3.1396081677261747, | |
| "kl": 0.11572265625, | |
| "learning_rate": 4.63855421686747e-07, | |
| "loss": 0.0046, | |
| "reward": 2.194140672683716, | |
| "reward_std": 0.2116081416606903, | |
| "rewards/accuracy_reward": 1.1941407322883606, | |
| "rewards/format_reward": 1.0, | |
| "step": 267 | |
| }, | |
| { | |
| "completion_length": 67.8203125, | |
| "epoch": 3.2289156626506026, | |
| "grad_norm": 7.260439555595242, | |
| "kl": 0.08837890625, | |
| "learning_rate": 4.6184738955823296e-07, | |
| "loss": 0.0035, | |
| "reward": 2.252182364463806, | |
| "reward_std": 0.1803755983710289, | |
| "rewards/accuracy_reward": 1.259994924068451, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 268 | |
| }, | |
| { | |
| "completion_length": 67.390625, | |
| "epoch": 3.2409638554216866, | |
| "grad_norm": 3.5049860895757696, | |
| "kl": 0.08935546875, | |
| "learning_rate": 4.5983935742971884e-07, | |
| "loss": 0.0036, | |
| "reward": 2.2208237648010254, | |
| "reward_std": 0.23105446994304657, | |
| "rewards/accuracy_reward": 1.2286362648010254, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 269 | |
| }, | |
| { | |
| "completion_length": 70.8515625, | |
| "epoch": 3.253012048192771, | |
| "grad_norm": 5.489156591080696, | |
| "kl": 0.131591796875, | |
| "learning_rate": 4.5783132530120476e-07, | |
| "loss": 0.0053, | |
| "reward": 2.2373805046081543, | |
| "reward_std": 0.2680865153670311, | |
| "rewards/accuracy_reward": 1.2373805046081543, | |
| "rewards/format_reward": 1.0, | |
| "step": 270 | |
| }, | |
| { | |
| "completion_length": 67.3359375, | |
| "epoch": 3.2650602409638556, | |
| "grad_norm": 3.943203757539833, | |
| "kl": 0.102783203125, | |
| "learning_rate": 4.5582329317269074e-07, | |
| "loss": 0.0041, | |
| "reward": 2.2856905460357666, | |
| "reward_std": 0.2643607556819916, | |
| "rewards/accuracy_reward": 1.2856906652450562, | |
| "rewards/format_reward": 1.0, | |
| "step": 271 | |
| }, | |
| { | |
| "completion_length": 76.703125, | |
| "epoch": 3.2771084337349397, | |
| "grad_norm": 4.067837029288379, | |
| "kl": 0.14794921875, | |
| "learning_rate": 4.5381526104417667e-07, | |
| "loss": 0.0059, | |
| "reward": 2.2173361778259277, | |
| "reward_std": 0.23457611352205276, | |
| "rewards/accuracy_reward": 1.2251486778259277, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 272 | |
| }, | |
| { | |
| "completion_length": 70.9765625, | |
| "epoch": 3.289156626506024, | |
| "grad_norm": 3.356513487854019, | |
| "kl": 0.105712890625, | |
| "learning_rate": 4.5180722891566265e-07, | |
| "loss": 0.0042, | |
| "reward": 2.3274762630462646, | |
| "reward_std": 0.1404755339026451, | |
| "rewards/accuracy_reward": 1.327476143836975, | |
| "rewards/format_reward": 1.0, | |
| "step": 273 | |
| }, | |
| { | |
| "completion_length": 73.5546875, | |
| "epoch": 3.3012048192771086, | |
| "grad_norm": 2.8662666869018194, | |
| "kl": 0.087646484375, | |
| "learning_rate": 4.497991967871486e-07, | |
| "loss": 0.0035, | |
| "reward": 2.4234249591827393, | |
| "reward_std": 0.23345230519771576, | |
| "rewards/accuracy_reward": 1.4234249591827393, | |
| "rewards/format_reward": 1.0, | |
| "step": 274 | |
| }, | |
| { | |
| "completion_length": 76.2890625, | |
| "epoch": 3.3132530120481927, | |
| "grad_norm": 3.6359732134875027, | |
| "kl": 0.0849609375, | |
| "learning_rate": 4.4779116465863456e-07, | |
| "loss": 0.0034, | |
| "reward": 2.2799594402313232, | |
| "reward_std": 0.17667143046855927, | |
| "rewards/accuracy_reward": 1.2799595594406128, | |
| "rewards/format_reward": 1.0, | |
| "step": 275 | |
| }, | |
| { | |
| "completion_length": 74.9296875, | |
| "epoch": 3.325301204819277, | |
| "grad_norm": 3.4769457078888513, | |
| "kl": 0.1181640625, | |
| "learning_rate": 4.4578313253012043e-07, | |
| "loss": 0.0047, | |
| "reward": 2.282673478126526, | |
| "reward_std": 0.20452508330345154, | |
| "rewards/accuracy_reward": 1.282673418521881, | |
| "rewards/format_reward": 1.0, | |
| "step": 276 | |
| }, | |
| { | |
| "completion_length": 73.828125, | |
| "epoch": 3.337349397590361, | |
| "grad_norm": 5.230024279024117, | |
| "kl": 0.0830078125, | |
| "learning_rate": 4.437751004016064e-07, | |
| "loss": 0.0033, | |
| "reward": 2.2097089290618896, | |
| "reward_std": 0.22180304676294327, | |
| "rewards/accuracy_reward": 1.2097087502479553, | |
| "rewards/format_reward": 1.0, | |
| "step": 277 | |
| }, | |
| { | |
| "completion_length": 72.7109375, | |
| "epoch": 3.3493975903614457, | |
| "grad_norm": 3.8728422379908416, | |
| "kl": 0.095458984375, | |
| "learning_rate": 4.4176706827309234e-07, | |
| "loss": 0.0038, | |
| "reward": 2.491241931915283, | |
| "reward_std": 0.22739917039871216, | |
| "rewards/accuracy_reward": 1.4912420511245728, | |
| "rewards/format_reward": 1.0, | |
| "step": 278 | |
| }, | |
| { | |
| "completion_length": 78.5078125, | |
| "epoch": 3.36144578313253, | |
| "grad_norm": 3.6858021846036535, | |
| "kl": 0.0908203125, | |
| "learning_rate": 4.3975903614457827e-07, | |
| "loss": 0.0036, | |
| "reward": 2.243127226829529, | |
| "reward_std": 0.22939348965883255, | |
| "rewards/accuracy_reward": 1.2431272268295288, | |
| "rewards/format_reward": 1.0, | |
| "step": 279 | |
| }, | |
| { | |
| "completion_length": 72.765625, | |
| "epoch": 3.3734939759036147, | |
| "grad_norm": 4.156042584491376, | |
| "kl": 0.1044921875, | |
| "learning_rate": 4.3775100401606425e-07, | |
| "loss": 0.0042, | |
| "reward": 2.2150485515594482, | |
| "reward_std": 0.23025363683700562, | |
| "rewards/accuracy_reward": 1.2228610515594482, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 280 | |
| }, | |
| { | |
| "completion_length": 77.0390625, | |
| "epoch": 3.3855421686746987, | |
| "grad_norm": 3.3549823921313475, | |
| "kl": 0.100341796875, | |
| "learning_rate": 4.3574297188755017e-07, | |
| "loss": 0.004, | |
| "reward": 2.211505889892578, | |
| "reward_std": 0.24677567183971405, | |
| "rewards/accuracy_reward": 1.227130949497223, | |
| "rewards/format_reward": 0.984375, | |
| "step": 281 | |
| }, | |
| { | |
| "completion_length": 78.296875, | |
| "epoch": 3.397590361445783, | |
| "grad_norm": 3.5036767872389514, | |
| "kl": 0.0859375, | |
| "learning_rate": 4.3373493975903615e-07, | |
| "loss": 0.0034, | |
| "reward": 2.346588611602783, | |
| "reward_std": 0.20112959295511246, | |
| "rewards/accuracy_reward": 1.3465884923934937, | |
| "rewards/format_reward": 1.0, | |
| "step": 282 | |
| }, | |
| { | |
| "completion_length": 84.484375, | |
| "epoch": 3.4096385542168672, | |
| "grad_norm": 3.0794227415803874, | |
| "kl": 0.09326171875, | |
| "learning_rate": 4.3172690763052203e-07, | |
| "loss": 0.0037, | |
| "reward": 2.230928421020508, | |
| "reward_std": 0.26287955790758133, | |
| "rewards/accuracy_reward": 1.2387409210205078, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 283 | |
| }, | |
| { | |
| "completion_length": 84.0546875, | |
| "epoch": 3.4216867469879517, | |
| "grad_norm": 9.632017573370238, | |
| "kl": 0.086181640625, | |
| "learning_rate": 4.29718875502008e-07, | |
| "loss": 0.0034, | |
| "reward": 2.2049087285995483, | |
| "reward_std": 0.19046999514102936, | |
| "rewards/accuracy_reward": 1.204908847808838, | |
| "rewards/format_reward": 1.0, | |
| "step": 284 | |
| }, | |
| { | |
| "completion_length": 74.875, | |
| "epoch": 3.433734939759036, | |
| "grad_norm": 3.04437077789607, | |
| "kl": 0.07861328125, | |
| "learning_rate": 4.2771084337349393e-07, | |
| "loss": 0.0031, | |
| "reward": 2.3966974020004272, | |
| "reward_std": 0.1937796175479889, | |
| "rewards/accuracy_reward": 1.3966973423957825, | |
| "rewards/format_reward": 1.0, | |
| "step": 285 | |
| }, | |
| { | |
| "completion_length": 75.8359375, | |
| "epoch": 3.4457831325301207, | |
| "grad_norm": 5.311045139915637, | |
| "kl": 0.163330078125, | |
| "learning_rate": 4.257028112449799e-07, | |
| "loss": 0.0065, | |
| "reward": 2.3752543926239014, | |
| "reward_std": 0.2273067831993103, | |
| "rewards/accuracy_reward": 1.3830668926239014, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 286 | |
| }, | |
| { | |
| "completion_length": 78.6328125, | |
| "epoch": 3.4578313253012047, | |
| "grad_norm": 3.0911678350526763, | |
| "kl": 0.082763671875, | |
| "learning_rate": 4.2369477911646584e-07, | |
| "loss": 0.0033, | |
| "reward": 2.3473113775253296, | |
| "reward_std": 0.14994988590478897, | |
| "rewards/accuracy_reward": 1.3473113775253296, | |
| "rewards/format_reward": 1.0, | |
| "step": 287 | |
| }, | |
| { | |
| "completion_length": 79.1640625, | |
| "epoch": 3.4698795180722892, | |
| "grad_norm": 3.5847413181475947, | |
| "kl": 0.0849609375, | |
| "learning_rate": 4.216867469879518e-07, | |
| "loss": 0.0034, | |
| "reward": 2.433477997779846, | |
| "reward_std": 0.1769290268421173, | |
| "rewards/accuracy_reward": 1.4334778785705566, | |
| "rewards/format_reward": 1.0, | |
| "step": 288 | |
| }, | |
| { | |
| "completion_length": 83.390625, | |
| "epoch": 3.4819277108433733, | |
| "grad_norm": 4.01569190307187, | |
| "kl": 0.09521484375, | |
| "learning_rate": 4.1967871485943775e-07, | |
| "loss": 0.0038, | |
| "reward": 2.2789034843444824, | |
| "reward_std": 0.2845103293657303, | |
| "rewards/accuracy_reward": 1.2867161631584167, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 289 | |
| }, | |
| { | |
| "completion_length": 81.90625, | |
| "epoch": 3.4939759036144578, | |
| "grad_norm": 3.286849126987869, | |
| "kl": 0.08642578125, | |
| "learning_rate": 4.176706827309237e-07, | |
| "loss": 0.0035, | |
| "reward": 2.362874150276184, | |
| "reward_std": 0.19387810677289963, | |
| "rewards/accuracy_reward": 1.362874150276184, | |
| "rewards/format_reward": 1.0, | |
| "step": 290 | |
| }, | |
| { | |
| "completion_length": 82.6640625, | |
| "epoch": 3.5060240963855422, | |
| "grad_norm": 3.658103173473351, | |
| "kl": 0.10888671875, | |
| "learning_rate": 4.156626506024096e-07, | |
| "loss": 0.0043, | |
| "reward": 2.0810331106185913, | |
| "reward_std": 0.3057002127170563, | |
| "rewards/accuracy_reward": 1.088845670223236, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 291 | |
| }, | |
| { | |
| "completion_length": 78.921875, | |
| "epoch": 3.5180722891566267, | |
| "grad_norm": 3.7103596490236774, | |
| "kl": 0.08349609375, | |
| "learning_rate": 4.1365461847389553e-07, | |
| "loss": 0.0033, | |
| "reward": 2.511967420578003, | |
| "reward_std": 0.16890805214643478, | |
| "rewards/accuracy_reward": 1.5119673609733582, | |
| "rewards/format_reward": 1.0, | |
| "step": 292 | |
| }, | |
| { | |
| "completion_length": 79.0703125, | |
| "epoch": 3.5301204819277108, | |
| "grad_norm": 4.407185593870522, | |
| "kl": 0.099853515625, | |
| "learning_rate": 4.116465863453815e-07, | |
| "loss": 0.004, | |
| "reward": 2.298495650291443, | |
| "reward_std": 0.18783311545848846, | |
| "rewards/accuracy_reward": 1.2984956502914429, | |
| "rewards/format_reward": 1.0, | |
| "step": 293 | |
| }, | |
| { | |
| "completion_length": 77.796875, | |
| "epoch": 3.5421686746987953, | |
| "grad_norm": 4.826014110118868, | |
| "kl": 0.09814453125, | |
| "learning_rate": 4.0963855421686744e-07, | |
| "loss": 0.0039, | |
| "reward": 2.2871015071868896, | |
| "reward_std": 0.2442024052143097, | |
| "rewards/accuracy_reward": 1.2871016263961792, | |
| "rewards/format_reward": 1.0, | |
| "step": 294 | |
| }, | |
| { | |
| "completion_length": 81.0390625, | |
| "epoch": 3.5542168674698793, | |
| "grad_norm": 5.044218587715949, | |
| "kl": 0.1220703125, | |
| "learning_rate": 4.076305220883534e-07, | |
| "loss": 0.0049, | |
| "reward": 2.3120492696762085, | |
| "reward_std": 0.26864828169345856, | |
| "rewards/accuracy_reward": 1.3198617696762085, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 295 | |
| }, | |
| { | |
| "completion_length": 81.8046875, | |
| "epoch": 3.566265060240964, | |
| "grad_norm": 4.035337217053536, | |
| "kl": 0.102783203125, | |
| "learning_rate": 4.0562248995983934e-07, | |
| "loss": 0.0041, | |
| "reward": 2.2244678735733032, | |
| "reward_std": 0.19216852635145187, | |
| "rewards/accuracy_reward": 1.2244678139686584, | |
| "rewards/format_reward": 1.0, | |
| "step": 296 | |
| }, | |
| { | |
| "completion_length": 82.1875, | |
| "epoch": 3.5783132530120483, | |
| "grad_norm": 5.473424541297646, | |
| "kl": 0.082275390625, | |
| "learning_rate": 4.036144578313253e-07, | |
| "loss": 0.0033, | |
| "reward": 2.1482508182525635, | |
| "reward_std": 0.2517557144165039, | |
| "rewards/accuracy_reward": 1.1560633182525635, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 297 | |
| }, | |
| { | |
| "completion_length": 76.8828125, | |
| "epoch": 3.5903614457831328, | |
| "grad_norm": 3.624065660089473, | |
| "kl": 0.099609375, | |
| "learning_rate": 4.0160642570281125e-07, | |
| "loss": 0.004, | |
| "reward": 2.460606813430786, | |
| "reward_std": 0.20688265562057495, | |
| "rewards/accuracy_reward": 1.476231873035431, | |
| "rewards/format_reward": 0.984375, | |
| "step": 298 | |
| }, | |
| { | |
| "completion_length": 73.8828125, | |
| "epoch": 3.602409638554217, | |
| "grad_norm": 3.2496622555871775, | |
| "kl": 0.10302734375, | |
| "learning_rate": 3.995983935742971e-07, | |
| "loss": 0.0041, | |
| "reward": 2.448202967643738, | |
| "reward_std": 0.20513835549354553, | |
| "rewards/accuracy_reward": 1.4482029676437378, | |
| "rewards/format_reward": 1.0, | |
| "step": 299 | |
| }, | |
| { | |
| "completion_length": 73.8828125, | |
| "epoch": 3.6144578313253013, | |
| "grad_norm": 3.248403260656612, | |
| "kl": 0.1142578125, | |
| "learning_rate": 3.975903614457831e-07, | |
| "loss": 0.0046, | |
| "reward": 2.3579249382019043, | |
| "reward_std": 0.26106585562229156, | |
| "rewards/accuracy_reward": 1.3657374382019043, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 300 | |
| }, | |
| { | |
| "completion_length": 81.78125, | |
| "epoch": 3.6265060240963853, | |
| "grad_norm": 4.192951592702023, | |
| "kl": 0.090087890625, | |
| "learning_rate": 3.9558232931726903e-07, | |
| "loss": 0.0036, | |
| "reward": 2.320730686187744, | |
| "reward_std": 0.17225497588515282, | |
| "rewards/accuracy_reward": 1.3207308053970337, | |
| "rewards/format_reward": 1.0, | |
| "step": 301 | |
| }, | |
| { | |
| "completion_length": 81.78125, | |
| "epoch": 3.63855421686747, | |
| "grad_norm": 3.914334064533718, | |
| "kl": 0.082763671875, | |
| "learning_rate": 3.93574297188755e-07, | |
| "loss": 0.0033, | |
| "reward": 2.2756303548812866, | |
| "reward_std": 0.21440081298351288, | |
| "rewards/accuracy_reward": 1.2834429144859314, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 302 | |
| }, | |
| { | |
| "completion_length": 83.984375, | |
| "epoch": 3.6506024096385543, | |
| "grad_norm": 2.9158995310046705, | |
| "kl": 0.09326171875, | |
| "learning_rate": 3.9156626506024094e-07, | |
| "loss": 0.0037, | |
| "reward": 2.340207576751709, | |
| "reward_std": 0.22486132383346558, | |
| "rewards/accuracy_reward": 1.3402075171470642, | |
| "rewards/format_reward": 1.0, | |
| "step": 303 | |
| }, | |
| { | |
| "completion_length": 73.0078125, | |
| "epoch": 3.662650602409639, | |
| "grad_norm": 3.64523826351094, | |
| "kl": 0.130615234375, | |
| "learning_rate": 3.895582329317269e-07, | |
| "loss": 0.0052, | |
| "reward": 2.306045651435852, | |
| "reward_std": 0.21042678505182266, | |
| "rewards/accuracy_reward": 1.313858151435852, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 304 | |
| }, | |
| { | |
| "completion_length": 77.140625, | |
| "epoch": 3.674698795180723, | |
| "grad_norm": 4.763683185347457, | |
| "kl": 0.09619140625, | |
| "learning_rate": 3.8755020080321285e-07, | |
| "loss": 0.0038, | |
| "reward": 2.292635202407837, | |
| "reward_std": 0.24200939387083054, | |
| "rewards/accuracy_reward": 1.308260202407837, | |
| "rewards/format_reward": 0.984375, | |
| "step": 305 | |
| }, | |
| { | |
| "completion_length": 80.6875, | |
| "epoch": 3.6867469879518073, | |
| "grad_norm": 15.378313149094321, | |
| "kl": 0.130126953125, | |
| "learning_rate": 3.8554216867469877e-07, | |
| "loss": 0.0052, | |
| "reward": 2.2641184329986572, | |
| "reward_std": 0.20184506475925446, | |
| "rewards/accuracy_reward": 1.2719308137893677, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 306 | |
| }, | |
| { | |
| "completion_length": 72.4453125, | |
| "epoch": 3.6987951807228914, | |
| "grad_norm": 6.1838290298686225, | |
| "kl": 0.114501953125, | |
| "learning_rate": 3.835341365461847e-07, | |
| "loss": 0.0046, | |
| "reward": 2.4186692237854004, | |
| "reward_std": 0.20656991004943848, | |
| "rewards/accuracy_reward": 1.4264817833900452, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 307 | |
| }, | |
| { | |
| "completion_length": 73.71875, | |
| "epoch": 3.710843373493976, | |
| "grad_norm": 3.6680281562358794, | |
| "kl": 0.092041015625, | |
| "learning_rate": 3.815261044176707e-07, | |
| "loss": 0.0037, | |
| "reward": 2.3598402738571167, | |
| "reward_std": 0.1814076155424118, | |
| "rewards/accuracy_reward": 1.3598402738571167, | |
| "rewards/format_reward": 1.0, | |
| "step": 308 | |
| }, | |
| { | |
| "completion_length": 75.5625, | |
| "epoch": 3.7228915662650603, | |
| "grad_norm": 4.1513164017455635, | |
| "kl": 0.11962890625, | |
| "learning_rate": 3.795180722891566e-07, | |
| "loss": 0.0048, | |
| "reward": 2.2364041805267334, | |
| "reward_std": 0.20799466967582703, | |
| "rewards/accuracy_reward": 1.236404299736023, | |
| "rewards/format_reward": 1.0, | |
| "step": 309 | |
| }, | |
| { | |
| "completion_length": 76.2109375, | |
| "epoch": 3.734939759036145, | |
| "grad_norm": 4.53835509987933, | |
| "kl": 0.088623046875, | |
| "learning_rate": 3.7751004016064253e-07, | |
| "loss": 0.0036, | |
| "reward": 2.3527251482009888, | |
| "reward_std": 0.17692391574382782, | |
| "rewards/accuracy_reward": 1.3527252078056335, | |
| "rewards/format_reward": 1.0, | |
| "step": 310 | |
| }, | |
| { | |
| "completion_length": 80.4375, | |
| "epoch": 3.746987951807229, | |
| "grad_norm": 3.703393707261026, | |
| "kl": 0.1103515625, | |
| "learning_rate": 3.755020080321285e-07, | |
| "loss": 0.0044, | |
| "reward": 2.298377275466919, | |
| "reward_std": 0.21109677106142044, | |
| "rewards/accuracy_reward": 1.2983773350715637, | |
| "rewards/format_reward": 1.0, | |
| "step": 311 | |
| }, | |
| { | |
| "completion_length": 77.8125, | |
| "epoch": 3.7590361445783134, | |
| "grad_norm": 3.914375784414754, | |
| "kl": 0.138916015625, | |
| "learning_rate": 3.7349397590361444e-07, | |
| "loss": 0.0056, | |
| "reward": 2.1520947217941284, | |
| "reward_std": 0.19967754930257797, | |
| "rewards/accuracy_reward": 1.1520947813987732, | |
| "rewards/format_reward": 1.0, | |
| "step": 312 | |
| }, | |
| { | |
| "completion_length": 79.2578125, | |
| "epoch": 3.7710843373493974, | |
| "grad_norm": 5.606330092523797, | |
| "kl": 0.091064453125, | |
| "learning_rate": 3.714859437751004e-07, | |
| "loss": 0.0036, | |
| "reward": 2.3204472064971924, | |
| "reward_std": 0.1748044565320015, | |
| "rewards/accuracy_reward": 1.3204472661018372, | |
| "rewards/format_reward": 1.0, | |
| "step": 313 | |
| }, | |
| { | |
| "completion_length": 74.84375, | |
| "epoch": 3.783132530120482, | |
| "grad_norm": 3.2348525038063736, | |
| "kl": 0.08447265625, | |
| "learning_rate": 3.694779116465863e-07, | |
| "loss": 0.0034, | |
| "reward": 2.496751070022583, | |
| "reward_std": 0.2072158306837082, | |
| "rewards/accuracy_reward": 1.496751070022583, | |
| "rewards/format_reward": 1.0, | |
| "step": 314 | |
| }, | |
| { | |
| "completion_length": 74.296875, | |
| "epoch": 3.7951807228915664, | |
| "grad_norm": 3.7371491385040483, | |
| "kl": 0.0771484375, | |
| "learning_rate": 3.674698795180723e-07, | |
| "loss": 0.0031, | |
| "reward": 2.395453691482544, | |
| "reward_std": 0.16877512633800507, | |
| "rewards/accuracy_reward": 1.3954537510871887, | |
| "rewards/format_reward": 1.0, | |
| "step": 315 | |
| }, | |
| { | |
| "completion_length": 72.8671875, | |
| "epoch": 3.807228915662651, | |
| "grad_norm": 5.799331345023467, | |
| "kl": 0.09619140625, | |
| "learning_rate": 3.654618473895582e-07, | |
| "loss": 0.0039, | |
| "reward": 2.307594895362854, | |
| "reward_std": 0.1985296756029129, | |
| "rewards/accuracy_reward": 1.307594895362854, | |
| "rewards/format_reward": 1.0, | |
| "step": 316 | |
| }, | |
| { | |
| "completion_length": 72.84375, | |
| "epoch": 3.819277108433735, | |
| "grad_norm": 5.215215330938529, | |
| "kl": 0.11083984375, | |
| "learning_rate": 3.634538152610442e-07, | |
| "loss": 0.0044, | |
| "reward": 2.2713290452957153, | |
| "reward_std": 0.15980049967765808, | |
| "rewards/accuracy_reward": 1.2791414856910706, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 317 | |
| }, | |
| { | |
| "completion_length": 66.28125, | |
| "epoch": 3.8313253012048194, | |
| "grad_norm": 9.42828281313003, | |
| "kl": 0.106201171875, | |
| "learning_rate": 3.614457831325301e-07, | |
| "loss": 0.0042, | |
| "reward": 2.441011667251587, | |
| "reward_std": 0.21370699256658554, | |
| "rewards/accuracy_reward": 1.4566364884376526, | |
| "rewards/format_reward": 0.984375, | |
| "step": 318 | |
| }, | |
| { | |
| "completion_length": 74.3359375, | |
| "epoch": 3.8433734939759034, | |
| "grad_norm": 3.380164477319568, | |
| "kl": 0.094970703125, | |
| "learning_rate": 3.5943775100401604e-07, | |
| "loss": 0.0038, | |
| "reward": 2.5070927143096924, | |
| "reward_std": 0.16660126298666, | |
| "rewards/accuracy_reward": 1.5149051547050476, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 319 | |
| }, | |
| { | |
| "completion_length": 71.3046875, | |
| "epoch": 3.855421686746988, | |
| "grad_norm": 4.006205885169367, | |
| "kl": 0.128662109375, | |
| "learning_rate": 3.57429718875502e-07, | |
| "loss": 0.0051, | |
| "reward": 2.3042829036712646, | |
| "reward_std": 0.2031613141298294, | |
| "rewards/accuracy_reward": 1.3042829036712646, | |
| "rewards/format_reward": 1.0, | |
| "step": 320 | |
| }, | |
| { | |
| "completion_length": 73.9609375, | |
| "epoch": 3.8674698795180724, | |
| "grad_norm": 5.771036516275782, | |
| "kl": 0.093017578125, | |
| "learning_rate": 3.554216867469879e-07, | |
| "loss": 0.0037, | |
| "reward": 2.422416090965271, | |
| "reward_std": 0.19139418005943298, | |
| "rewards/accuracy_reward": 1.4302285313606262, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 321 | |
| }, | |
| { | |
| "completion_length": 71.734375, | |
| "epoch": 3.8795180722891565, | |
| "grad_norm": 5.860041479699707, | |
| "kl": 0.110595703125, | |
| "learning_rate": 3.5341365461847387e-07, | |
| "loss": 0.0044, | |
| "reward": 2.100473999977112, | |
| "reward_std": 0.21565508097410202, | |
| "rewards/accuracy_reward": 1.1004739999771118, | |
| "rewards/format_reward": 1.0, | |
| "step": 322 | |
| }, | |
| { | |
| "completion_length": 69.046875, | |
| "epoch": 3.891566265060241, | |
| "grad_norm": 4.962719097630754, | |
| "kl": 0.1396484375, | |
| "learning_rate": 3.514056224899598e-07, | |
| "loss": 0.0056, | |
| "reward": 2.337049961090088, | |
| "reward_std": 0.201468363404274, | |
| "rewards/accuracy_reward": 1.337049961090088, | |
| "rewards/format_reward": 1.0, | |
| "step": 323 | |
| }, | |
| { | |
| "completion_length": 70.0234375, | |
| "epoch": 3.9036144578313254, | |
| "grad_norm": 3.786778485554144, | |
| "kl": 0.1064453125, | |
| "learning_rate": 3.493975903614458e-07, | |
| "loss": 0.0043, | |
| "reward": 2.282514452934265, | |
| "reward_std": 0.2470734864473343, | |
| "rewards/accuracy_reward": 1.2903268933296204, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 324 | |
| }, | |
| { | |
| "completion_length": 66.5546875, | |
| "epoch": 3.9156626506024095, | |
| "grad_norm": 5.681847770854111, | |
| "kl": 0.14599609375, | |
| "learning_rate": 3.473895582329317e-07, | |
| "loss": 0.0059, | |
| "reward": 2.2830464839935303, | |
| "reward_std": 0.16951018571853638, | |
| "rewards/accuracy_reward": 1.2830466032028198, | |
| "rewards/format_reward": 1.0, | |
| "step": 325 | |
| }, | |
| { | |
| "completion_length": 69.9765625, | |
| "epoch": 3.927710843373494, | |
| "grad_norm": 3.545177223680582, | |
| "kl": 0.1123046875, | |
| "learning_rate": 3.453815261044177e-07, | |
| "loss": 0.0045, | |
| "reward": 2.3249276876449585, | |
| "reward_std": 0.23469389975070953, | |
| "rewards/accuracy_reward": 1.3249276876449585, | |
| "rewards/format_reward": 1.0, | |
| "step": 326 | |
| }, | |
| { | |
| "completion_length": 67.2109375, | |
| "epoch": 3.9397590361445785, | |
| "grad_norm": 4.464381426334607, | |
| "kl": 0.111328125, | |
| "learning_rate": 3.433734939759036e-07, | |
| "loss": 0.0045, | |
| "reward": 2.313346743583679, | |
| "reward_std": 0.24960950016975403, | |
| "rewards/accuracy_reward": 1.321159303188324, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 327 | |
| }, | |
| { | |
| "completion_length": 69.5390625, | |
| "epoch": 3.9518072289156625, | |
| "grad_norm": 5.503294892764904, | |
| "kl": 0.13818359375, | |
| "learning_rate": 3.413654618473896e-07, | |
| "loss": 0.0055, | |
| "reward": 2.250451922416687, | |
| "reward_std": 0.19627484679222107, | |
| "rewards/accuracy_reward": 1.2582644820213318, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 328 | |
| }, | |
| { | |
| "completion_length": 72.875, | |
| "epoch": 3.963855421686747, | |
| "grad_norm": 3.94333602961405, | |
| "kl": 0.126953125, | |
| "learning_rate": 3.3935742971887547e-07, | |
| "loss": 0.0051, | |
| "reward": 2.4282917976379395, | |
| "reward_std": 0.23817364871501923, | |
| "rewards/accuracy_reward": 1.4361043572425842, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 329 | |
| }, | |
| { | |
| "completion_length": 68.078125, | |
| "epoch": 3.9759036144578315, | |
| "grad_norm": 4.246221946155538, | |
| "kl": 0.10302734375, | |
| "learning_rate": 3.373493975903614e-07, | |
| "loss": 0.0041, | |
| "reward": 2.3756778240203857, | |
| "reward_std": 0.23032685369253159, | |
| "rewards/accuracy_reward": 1.3756778836250305, | |
| "rewards/format_reward": 1.0, | |
| "step": 330 | |
| }, | |
| { | |
| "completion_length": 63.171875, | |
| "epoch": 3.9879518072289155, | |
| "grad_norm": 4.823180720092978, | |
| "kl": 0.14111328125, | |
| "learning_rate": 3.353413654618474e-07, | |
| "loss": 0.0057, | |
| "reward": 2.2716495990753174, | |
| "reward_std": 0.25546562671661377, | |
| "rewards/accuracy_reward": 1.2794621586799622, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 331 | |
| }, | |
| { | |
| "completion_length": 79.75000381469727, | |
| "epoch": 4.0, | |
| "grad_norm": 3.966089593429622, | |
| "kl": 0.10986328125, | |
| "learning_rate": 3.333333333333333e-07, | |
| "loss": 0.0047, | |
| "reward": 1.9844202995300293, | |
| "reward_std": 0.41577973030507565, | |
| "rewards/accuracy_reward": 0.9844204187393188, | |
| "rewards/format_reward": 1.0, | |
| "step": 332 | |
| }, | |
| { | |
| "completion_length": 67.8984375, | |
| "epoch": 4.0120481927710845, | |
| "grad_norm": 3.4890518846644203, | |
| "kl": 0.112548828125, | |
| "learning_rate": 3.313253012048193e-07, | |
| "loss": 0.0045, | |
| "reward": 2.273194432258606, | |
| "reward_std": 0.1845482587814331, | |
| "rewards/accuracy_reward": 1.2810069918632507, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 333 | |
| }, | |
| { | |
| "completion_length": 70.1328125, | |
| "epoch": 4.024096385542169, | |
| "grad_norm": 3.1401475074211698, | |
| "kl": 0.106201171875, | |
| "learning_rate": 3.293172690763052e-07, | |
| "loss": 0.0042, | |
| "reward": 2.348654627799988, | |
| "reward_std": 0.20452319085597992, | |
| "rewards/accuracy_reward": 1.3564670085906982, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 334 | |
| }, | |
| { | |
| "completion_length": 67.4296875, | |
| "epoch": 4.036144578313253, | |
| "grad_norm": 4.049959483426693, | |
| "kl": 0.107177734375, | |
| "learning_rate": 3.273092369477912e-07, | |
| "loss": 0.0043, | |
| "reward": 2.270454525947571, | |
| "reward_std": 0.21142029762268066, | |
| "rewards/accuracy_reward": 1.2704546451568604, | |
| "rewards/format_reward": 1.0, | |
| "step": 335 | |
| }, | |
| { | |
| "completion_length": 71.1484375, | |
| "epoch": 4.048192771084337, | |
| "grad_norm": 3.9561612834766273, | |
| "kl": 0.097412109375, | |
| "learning_rate": 3.2530120481927706e-07, | |
| "loss": 0.0039, | |
| "reward": 2.1833893060684204, | |
| "reward_std": 0.1801520176231861, | |
| "rewards/accuracy_reward": 1.1912018656730652, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 336 | |
| }, | |
| { | |
| "completion_length": 69.59375, | |
| "epoch": 4.0602409638554215, | |
| "grad_norm": 3.977655100011985, | |
| "kl": 0.1474609375, | |
| "learning_rate": 3.2329317269076304e-07, | |
| "loss": 0.0059, | |
| "reward": 2.2047336101531982, | |
| "reward_std": 0.1999206244945526, | |
| "rewards/accuracy_reward": 1.204733669757843, | |
| "rewards/format_reward": 1.0, | |
| "step": 337 | |
| }, | |
| { | |
| "completion_length": 61.4765625, | |
| "epoch": 4.072289156626506, | |
| "grad_norm": 4.191698428231115, | |
| "kl": 0.12939453125, | |
| "learning_rate": 3.2128514056224897e-07, | |
| "loss": 0.0052, | |
| "reward": 2.3498200178146362, | |
| "reward_std": 0.2275300845503807, | |
| "rewards/accuracy_reward": 1.3498198986053467, | |
| "rewards/format_reward": 1.0, | |
| "step": 338 | |
| }, | |
| { | |
| "completion_length": 64.4140625, | |
| "epoch": 4.0843373493975905, | |
| "grad_norm": 3.9067810348739114, | |
| "kl": 0.116943359375, | |
| "learning_rate": 3.192771084337349e-07, | |
| "loss": 0.0047, | |
| "reward": 2.352308511734009, | |
| "reward_std": 0.22002745419740677, | |
| "rewards/accuracy_reward": 1.3523083925247192, | |
| "rewards/format_reward": 1.0, | |
| "step": 339 | |
| }, | |
| { | |
| "completion_length": 73.2890625, | |
| "epoch": 4.096385542168675, | |
| "grad_norm": 4.489032904646898, | |
| "kl": 0.104736328125, | |
| "learning_rate": 3.172690763052209e-07, | |
| "loss": 0.0042, | |
| "reward": 2.1710336208343506, | |
| "reward_std": 0.17718148604035378, | |
| "rewards/accuracy_reward": 1.1710334420204163, | |
| "rewards/format_reward": 1.0, | |
| "step": 340 | |
| }, | |
| { | |
| "completion_length": 74.3671875, | |
| "epoch": 4.108433734939759, | |
| "grad_norm": 4.230949730619595, | |
| "kl": 0.139892578125, | |
| "learning_rate": 3.152610441767068e-07, | |
| "loss": 0.0056, | |
| "reward": 2.084486246109009, | |
| "reward_std": 0.2170683741569519, | |
| "rewards/accuracy_reward": 1.0922988057136536, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 341 | |
| }, | |
| { | |
| "completion_length": 65.5625, | |
| "epoch": 4.120481927710843, | |
| "grad_norm": 5.461293103432774, | |
| "kl": 0.1044921875, | |
| "learning_rate": 3.132530120481928e-07, | |
| "loss": 0.0042, | |
| "reward": 2.381394147872925, | |
| "reward_std": 0.193039670586586, | |
| "rewards/accuracy_reward": 1.38139408826828, | |
| "rewards/format_reward": 1.0, | |
| "step": 342 | |
| }, | |
| { | |
| "completion_length": 66.15625, | |
| "epoch": 4.132530120481928, | |
| "grad_norm": 4.070866693962467, | |
| "kl": 0.111572265625, | |
| "learning_rate": 3.112449799196787e-07, | |
| "loss": 0.0045, | |
| "reward": 2.357278347015381, | |
| "reward_std": 0.15215902030467987, | |
| "rewards/accuracy_reward": 1.3729035258293152, | |
| "rewards/format_reward": 0.984375, | |
| "step": 343 | |
| }, | |
| { | |
| "completion_length": 69.1328125, | |
| "epoch": 4.144578313253012, | |
| "grad_norm": 4.335873726549927, | |
| "kl": 0.123046875, | |
| "learning_rate": 3.0923694779116464e-07, | |
| "loss": 0.0049, | |
| "reward": 2.282222032546997, | |
| "reward_std": 0.25280918926000595, | |
| "rewards/accuracy_reward": 1.2978470921516418, | |
| "rewards/format_reward": 0.984375, | |
| "step": 344 | |
| }, | |
| { | |
| "completion_length": 73.6015625, | |
| "epoch": 4.156626506024097, | |
| "grad_norm": 4.412489990442917, | |
| "kl": 0.09765625, | |
| "learning_rate": 3.0722891566265056e-07, | |
| "loss": 0.0039, | |
| "reward": 2.421238660812378, | |
| "reward_std": 0.21779820322990417, | |
| "rewards/accuracy_reward": 1.4290512800216675, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 345 | |
| }, | |
| { | |
| "completion_length": 67.3984375, | |
| "epoch": 4.168674698795181, | |
| "grad_norm": 3.7050619604015775, | |
| "kl": 0.111083984375, | |
| "learning_rate": 3.0522088353413654e-07, | |
| "loss": 0.0044, | |
| "reward": 2.4159966707229614, | |
| "reward_std": 0.17116259038448334, | |
| "rewards/accuracy_reward": 1.4159966707229614, | |
| "rewards/format_reward": 1.0, | |
| "step": 346 | |
| }, | |
| { | |
| "completion_length": 68.7109375, | |
| "epoch": 4.180722891566265, | |
| "grad_norm": 4.638840034522594, | |
| "kl": 0.119873046875, | |
| "learning_rate": 3.0321285140562247e-07, | |
| "loss": 0.0048, | |
| "reward": 2.430918335914612, | |
| "reward_std": 0.23829656839370728, | |
| "rewards/accuracy_reward": 1.4309183359146118, | |
| "rewards/format_reward": 1.0, | |
| "step": 347 | |
| }, | |
| { | |
| "completion_length": 68.203125, | |
| "epoch": 4.192771084337349, | |
| "grad_norm": 7.531973472034052, | |
| "kl": 0.124267578125, | |
| "learning_rate": 3.0120481927710845e-07, | |
| "loss": 0.005, | |
| "reward": 2.2654261589050293, | |
| "reward_std": 0.214869923889637, | |
| "rewards/accuracy_reward": 1.2966760993003845, | |
| "rewards/format_reward": 0.96875, | |
| "step": 348 | |
| }, | |
| { | |
| "completion_length": 66.3046875, | |
| "epoch": 4.204819277108434, | |
| "grad_norm": 6.290139006407989, | |
| "kl": 0.15673828125, | |
| "learning_rate": 2.991967871485944e-07, | |
| "loss": 0.0063, | |
| "reward": 2.440833330154419, | |
| "reward_std": 0.20570393651723862, | |
| "rewards/accuracy_reward": 1.4642709493637085, | |
| "rewards/format_reward": 0.9765625, | |
| "step": 349 | |
| }, | |
| { | |
| "completion_length": 68.5078125, | |
| "epoch": 4.216867469879518, | |
| "grad_norm": 3.870085506410607, | |
| "kl": 0.11376953125, | |
| "learning_rate": 2.971887550200803e-07, | |
| "loss": 0.0046, | |
| "reward": 2.4419082403182983, | |
| "reward_std": 0.1332126259803772, | |
| "rewards/accuracy_reward": 1.441908359527588, | |
| "rewards/format_reward": 1.0, | |
| "step": 350 | |
| }, | |
| { | |
| "completion_length": 67.7109375, | |
| "epoch": 4.228915662650603, | |
| "grad_norm": 5.222390077968289, | |
| "kl": 0.12548828125, | |
| "learning_rate": 2.9518072289156623e-07, | |
| "loss": 0.005, | |
| "reward": 2.354392647743225, | |
| "reward_std": 0.250136561691761, | |
| "rewards/accuracy_reward": 1.3700175285339355, | |
| "rewards/format_reward": 0.984375, | |
| "step": 351 | |
| }, | |
| { | |
| "completion_length": 63.75, | |
| "epoch": 4.240963855421687, | |
| "grad_norm": 5.7394258697520835, | |
| "kl": 0.13671875, | |
| "learning_rate": 2.9317269076305216e-07, | |
| "loss": 0.0055, | |
| "reward": 2.1846532821655273, | |
| "reward_std": 0.27685467153787613, | |
| "rewards/accuracy_reward": 1.2080907225608826, | |
| "rewards/format_reward": 0.9765625, | |
| "step": 352 | |
| }, | |
| { | |
| "completion_length": 68.734375, | |
| "epoch": 4.253012048192771, | |
| "grad_norm": 3.522967170920438, | |
| "kl": 0.10400390625, | |
| "learning_rate": 2.9116465863453814e-07, | |
| "loss": 0.0041, | |
| "reward": 2.315014600753784, | |
| "reward_std": 0.13816260546445847, | |
| "rewards/accuracy_reward": 1.3150146007537842, | |
| "rewards/format_reward": 1.0, | |
| "step": 353 | |
| }, | |
| { | |
| "completion_length": 72.8125, | |
| "epoch": 4.265060240963855, | |
| "grad_norm": 3.727859373676823, | |
| "kl": 0.12939453125, | |
| "learning_rate": 2.8915662650602407e-07, | |
| "loss": 0.0052, | |
| "reward": 2.206972360610962, | |
| "reward_std": 0.23467965424060822, | |
| "rewards/accuracy_reward": 1.2069722414016724, | |
| "rewards/format_reward": 1.0, | |
| "step": 354 | |
| }, | |
| { | |
| "completion_length": 70.3359375, | |
| "epoch": 4.27710843373494, | |
| "grad_norm": 3.380662774166939, | |
| "kl": 0.09716796875, | |
| "learning_rate": 2.8714859437751005e-07, | |
| "loss": 0.0039, | |
| "reward": 2.1916306018829346, | |
| "reward_std": 0.23339906334877014, | |
| "rewards/accuracy_reward": 1.2072556018829346, | |
| "rewards/format_reward": 0.984375, | |
| "step": 355 | |
| }, | |
| { | |
| "completion_length": 72.4375, | |
| "epoch": 4.289156626506024, | |
| "grad_norm": 3.5703829288777764, | |
| "kl": 0.11376953125, | |
| "learning_rate": 2.85140562248996e-07, | |
| "loss": 0.0046, | |
| "reward": 2.142443895339966, | |
| "reward_std": 0.2050827294588089, | |
| "rewards/accuracy_reward": 1.1580689549446106, | |
| "rewards/format_reward": 0.984375, | |
| "step": 356 | |
| }, | |
| { | |
| "completion_length": 66.9921875, | |
| "epoch": 4.301204819277109, | |
| "grad_norm": 3.6787951883313275, | |
| "kl": 0.119873046875, | |
| "learning_rate": 2.8313253012048195e-07, | |
| "loss": 0.0048, | |
| "reward": 2.6013587713241577, | |
| "reward_std": 0.17792491614818573, | |
| "rewards/accuracy_reward": 1.6013588309288025, | |
| "rewards/format_reward": 1.0, | |
| "step": 357 | |
| }, | |
| { | |
| "completion_length": 67.1875, | |
| "epoch": 4.313253012048193, | |
| "grad_norm": 7.9299540096420476, | |
| "kl": 0.111328125, | |
| "learning_rate": 2.811244979919679e-07, | |
| "loss": 0.0044, | |
| "reward": 2.2114800214767456, | |
| "reward_std": 0.2541910707950592, | |
| "rewards/accuracy_reward": 1.2271050810813904, | |
| "rewards/format_reward": 0.984375, | |
| "step": 358 | |
| }, | |
| { | |
| "completion_length": 69.1953125, | |
| "epoch": 4.325301204819277, | |
| "grad_norm": 3.7315177619787687, | |
| "kl": 0.10400390625, | |
| "learning_rate": 2.7911646586345376e-07, | |
| "loss": 0.0042, | |
| "reward": 2.2850147485733032, | |
| "reward_std": 0.24116653203964233, | |
| "rewards/accuracy_reward": 1.3084524869918823, | |
| "rewards/format_reward": 0.9765625, | |
| "step": 359 | |
| }, | |
| { | |
| "completion_length": 76.6640625, | |
| "epoch": 4.337349397590361, | |
| "grad_norm": 3.8031600707561886, | |
| "kl": 0.08984375, | |
| "learning_rate": 2.7710843373493974e-07, | |
| "loss": 0.0036, | |
| "reward": 2.372725009918213, | |
| "reward_std": 0.23598377406597137, | |
| "rewards/accuracy_reward": 1.380537509918213, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 360 | |
| }, | |
| { | |
| "completion_length": 72.6015625, | |
| "epoch": 4.349397590361446, | |
| "grad_norm": 6.29903230301134, | |
| "kl": 0.10205078125, | |
| "learning_rate": 2.7510040160642566e-07, | |
| "loss": 0.0041, | |
| "reward": 2.3671088218688965, | |
| "reward_std": 0.21375955641269684, | |
| "rewards/accuracy_reward": 1.3749213814735413, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 361 | |
| }, | |
| { | |
| "completion_length": 74.546875, | |
| "epoch": 4.36144578313253, | |
| "grad_norm": 4.5097271327174555, | |
| "kl": 0.100341796875, | |
| "learning_rate": 2.7309236947791164e-07, | |
| "loss": 0.004, | |
| "reward": 2.338581085205078, | |
| "reward_std": 0.21793486177921295, | |
| "rewards/accuracy_reward": 1.3463934063911438, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 362 | |
| }, | |
| { | |
| "completion_length": 73.203125, | |
| "epoch": 4.373493975903615, | |
| "grad_norm": 7.563928087147195, | |
| "kl": 0.093505859375, | |
| "learning_rate": 2.7108433734939757e-07, | |
| "loss": 0.0037, | |
| "reward": 2.4811813831329346, | |
| "reward_std": 0.1661686971783638, | |
| "rewards/accuracy_reward": 1.4811814427375793, | |
| "rewards/format_reward": 1.0, | |
| "step": 363 | |
| }, | |
| { | |
| "completion_length": 72.2109375, | |
| "epoch": 4.385542168674699, | |
| "grad_norm": 4.157739455544304, | |
| "kl": 0.11767578125, | |
| "learning_rate": 2.6907630522088355e-07, | |
| "loss": 0.0047, | |
| "reward": 2.227518320083618, | |
| "reward_std": 0.2459297701716423, | |
| "rewards/accuracy_reward": 1.235330879688263, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 364 | |
| }, | |
| { | |
| "completion_length": 73.125, | |
| "epoch": 4.397590361445783, | |
| "grad_norm": 3.957643739786318, | |
| "kl": 0.130126953125, | |
| "learning_rate": 2.670682730923695e-07, | |
| "loss": 0.0052, | |
| "reward": 2.398737668991089, | |
| "reward_std": 0.2508920058608055, | |
| "rewards/accuracy_reward": 1.406550109386444, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 365 | |
| }, | |
| { | |
| "completion_length": 80.6484375, | |
| "epoch": 4.409638554216867, | |
| "grad_norm": 8.267939908268028, | |
| "kl": 0.126220703125, | |
| "learning_rate": 2.6506024096385546e-07, | |
| "loss": 0.005, | |
| "reward": 2.1884970664978027, | |
| "reward_std": 0.32723745703697205, | |
| "rewards/accuracy_reward": 1.2119346857070923, | |
| "rewards/format_reward": 0.9765625, | |
| "step": 366 | |
| }, | |
| { | |
| "completion_length": 80.09375, | |
| "epoch": 4.421686746987952, | |
| "grad_norm": 3.0023836541953988, | |
| "kl": 0.089111328125, | |
| "learning_rate": 2.6305220883534133e-07, | |
| "loss": 0.0036, | |
| "reward": 2.4019484519958496, | |
| "reward_std": 0.20879995077848434, | |
| "rewards/accuracy_reward": 1.4019483923912048, | |
| "rewards/format_reward": 1.0, | |
| "step": 367 | |
| }, | |
| { | |
| "completion_length": 76.890625, | |
| "epoch": 4.433734939759036, | |
| "grad_norm": 3.8760535577901916, | |
| "kl": 0.110107421875, | |
| "learning_rate": 2.610441767068273e-07, | |
| "loss": 0.0044, | |
| "reward": 2.217389702796936, | |
| "reward_std": 0.20581622421741486, | |
| "rewards/accuracy_reward": 1.225202202796936, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 368 | |
| }, | |
| { | |
| "completion_length": 70.046875, | |
| "epoch": 4.445783132530121, | |
| "grad_norm": 4.189426211226252, | |
| "kl": 0.09912109375, | |
| "learning_rate": 2.5903614457831324e-07, | |
| "loss": 0.004, | |
| "reward": 2.3884357213974, | |
| "reward_std": 0.23216703534126282, | |
| "rewards/accuracy_reward": 1.4118732213974, | |
| "rewards/format_reward": 0.9765625, | |
| "step": 369 | |
| }, | |
| { | |
| "completion_length": 75.3125, | |
| "epoch": 4.457831325301205, | |
| "grad_norm": 3.5709834038432886, | |
| "kl": 0.112060546875, | |
| "learning_rate": 2.5702811244979916e-07, | |
| "loss": 0.0045, | |
| "reward": 2.4395360946655273, | |
| "reward_std": 0.25345855951309204, | |
| "rewards/accuracy_reward": 1.4551611542701721, | |
| "rewards/format_reward": 0.984375, | |
| "step": 370 | |
| }, | |
| { | |
| "completion_length": 76.03125, | |
| "epoch": 4.469879518072289, | |
| "grad_norm": 3.8012985013892897, | |
| "kl": 0.11962890625, | |
| "learning_rate": 2.5502008032128514e-07, | |
| "loss": 0.0048, | |
| "reward": 2.2614444494247437, | |
| "reward_std": 0.25984859466552734, | |
| "rewards/accuracy_reward": 1.2692569494247437, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 371 | |
| }, | |
| { | |
| "completion_length": 72.34375, | |
| "epoch": 4.481927710843373, | |
| "grad_norm": 3.81905493683615, | |
| "kl": 0.118408203125, | |
| "learning_rate": 2.5301204819277107e-07, | |
| "loss": 0.0047, | |
| "reward": 2.24534273147583, | |
| "reward_std": 0.2783522978425026, | |
| "rewards/accuracy_reward": 1.25315523147583, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 372 | |
| }, | |
| { | |
| "completion_length": 73.625, | |
| "epoch": 4.493975903614458, | |
| "grad_norm": 5.859434170398068, | |
| "kl": 0.129638671875, | |
| "learning_rate": 2.5100401606425705e-07, | |
| "loss": 0.0052, | |
| "reward": 2.242166519165039, | |
| "reward_std": 0.19818732887506485, | |
| "rewards/accuracy_reward": 1.2421664595603943, | |
| "rewards/format_reward": 1.0, | |
| "step": 373 | |
| }, | |
| { | |
| "completion_length": 70.7734375, | |
| "epoch": 4.506024096385542, | |
| "grad_norm": 4.577359942879205, | |
| "kl": 0.113037109375, | |
| "learning_rate": 2.489959839357429e-07, | |
| "loss": 0.0045, | |
| "reward": 2.40807843208313, | |
| "reward_std": 0.16506175324320793, | |
| "rewards/accuracy_reward": 1.408078372478485, | |
| "rewards/format_reward": 1.0, | |
| "step": 374 | |
| }, | |
| { | |
| "completion_length": 71.6484375, | |
| "epoch": 4.518072289156627, | |
| "grad_norm": 3.6969886550918627, | |
| "kl": 0.0947265625, | |
| "learning_rate": 2.469879518072289e-07, | |
| "loss": 0.0038, | |
| "reward": 2.4090828895568848, | |
| "reward_std": 0.17872843891382217, | |
| "rewards/accuracy_reward": 1.4090829491615295, | |
| "rewards/format_reward": 1.0, | |
| "step": 375 | |
| }, | |
| { | |
| "completion_length": 75.640625, | |
| "epoch": 4.530120481927711, | |
| "grad_norm": 3.182069910394249, | |
| "kl": 0.112548828125, | |
| "learning_rate": 2.4497991967871483e-07, | |
| "loss": 0.0045, | |
| "reward": 2.429325222969055, | |
| "reward_std": 0.18355486541986465, | |
| "rewards/accuracy_reward": 1.4371376037597656, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 376 | |
| }, | |
| { | |
| "completion_length": 76.8515625, | |
| "epoch": 4.542168674698795, | |
| "grad_norm": 4.3761923522139625, | |
| "kl": 0.103515625, | |
| "learning_rate": 2.429718875502008e-07, | |
| "loss": 0.0041, | |
| "reward": 2.215627670288086, | |
| "reward_std": 0.29024538397789, | |
| "rewards/accuracy_reward": 1.2234401106834412, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 377 | |
| }, | |
| { | |
| "completion_length": 72.640625, | |
| "epoch": 4.554216867469879, | |
| "grad_norm": 5.739152465768093, | |
| "kl": 0.096923828125, | |
| "learning_rate": 2.4096385542168674e-07, | |
| "loss": 0.0039, | |
| "reward": 2.3864386081695557, | |
| "reward_std": 0.14991050213575363, | |
| "rewards/accuracy_reward": 1.3864384889602661, | |
| "rewards/format_reward": 1.0, | |
| "step": 378 | |
| }, | |
| { | |
| "completion_length": 73.7890625, | |
| "epoch": 4.566265060240964, | |
| "grad_norm": 4.330609617515541, | |
| "kl": 0.105712890625, | |
| "learning_rate": 2.3895582329317267e-07, | |
| "loss": 0.0042, | |
| "reward": 2.2676793336868286, | |
| "reward_std": 0.1841476932168007, | |
| "rewards/accuracy_reward": 1.2754917740821838, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 379 | |
| }, | |
| { | |
| "completion_length": 69.5859375, | |
| "epoch": 4.578313253012048, | |
| "grad_norm": 16.70825245009543, | |
| "kl": 0.103515625, | |
| "learning_rate": 2.3694779116465862e-07, | |
| "loss": 0.0041, | |
| "reward": 2.3687047958374023, | |
| "reward_std": 0.23368250578641891, | |
| "rewards/accuracy_reward": 1.3765172958374023, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 380 | |
| }, | |
| { | |
| "completion_length": 68.5703125, | |
| "epoch": 4.590361445783133, | |
| "grad_norm": 4.946973705468274, | |
| "kl": 0.11865234375, | |
| "learning_rate": 2.3493975903614457e-07, | |
| "loss": 0.0047, | |
| "reward": 2.409714102745056, | |
| "reward_std": 0.17494437843561172, | |
| "rewards/accuracy_reward": 1.4175265431404114, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 381 | |
| }, | |
| { | |
| "completion_length": 69.09375, | |
| "epoch": 4.602409638554217, | |
| "grad_norm": 3.4407209788639155, | |
| "kl": 0.108154296875, | |
| "learning_rate": 2.3293172690763053e-07, | |
| "loss": 0.0043, | |
| "reward": 2.3722596168518066, | |
| "reward_std": 0.2456066906452179, | |
| "rewards/accuracy_reward": 1.3722596764564514, | |
| "rewards/format_reward": 1.0, | |
| "step": 382 | |
| }, | |
| { | |
| "completion_length": 73.40625, | |
| "epoch": 4.614457831325301, | |
| "grad_norm": 6.785057754949663, | |
| "kl": 0.093017578125, | |
| "learning_rate": 2.3092369477911648e-07, | |
| "loss": 0.0037, | |
| "reward": 2.390730619430542, | |
| "reward_std": 0.13034258037805557, | |
| "rewards/accuracy_reward": 1.390730619430542, | |
| "rewards/format_reward": 1.0, | |
| "step": 383 | |
| }, | |
| { | |
| "completion_length": 69.578125, | |
| "epoch": 4.626506024096385, | |
| "grad_norm": 4.146766679362004, | |
| "kl": 0.110107421875, | |
| "learning_rate": 2.2891566265060238e-07, | |
| "loss": 0.0044, | |
| "reward": 2.457837224006653, | |
| "reward_std": 0.19646844267845154, | |
| "rewards/accuracy_reward": 1.465649664402008, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 384 | |
| }, | |
| { | |
| "completion_length": 71.4765625, | |
| "epoch": 4.63855421686747, | |
| "grad_norm": 3.5134218173180884, | |
| "kl": 0.10791015625, | |
| "learning_rate": 2.2690763052208834e-07, | |
| "loss": 0.0043, | |
| "reward": 2.2395870685577393, | |
| "reward_std": 0.23986083269119263, | |
| "rewards/accuracy_reward": 1.2630245089530945, | |
| "rewards/format_reward": 0.9765625, | |
| "step": 385 | |
| }, | |
| { | |
| "completion_length": 67.8984375, | |
| "epoch": 4.650602409638554, | |
| "grad_norm": 3.5532098801033323, | |
| "kl": 0.112060546875, | |
| "learning_rate": 2.248995983935743e-07, | |
| "loss": 0.0045, | |
| "reward": 2.155800759792328, | |
| "reward_std": 0.26599714159965515, | |
| "rewards/accuracy_reward": 1.1714258790016174, | |
| "rewards/format_reward": 0.984375, | |
| "step": 386 | |
| }, | |
| { | |
| "completion_length": 67.921875, | |
| "epoch": 4.662650602409639, | |
| "grad_norm": 3.977191337497143, | |
| "kl": 0.12353515625, | |
| "learning_rate": 2.2289156626506022e-07, | |
| "loss": 0.0049, | |
| "reward": 2.1573885679244995, | |
| "reward_std": 0.19674725830554962, | |
| "rewards/accuracy_reward": 1.165201187133789, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 387 | |
| }, | |
| { | |
| "completion_length": 73.3671875, | |
| "epoch": 4.674698795180722, | |
| "grad_norm": 3.4384187805900894, | |
| "kl": 0.1005859375, | |
| "learning_rate": 2.2088353413654617e-07, | |
| "loss": 0.004, | |
| "reward": 2.238619089126587, | |
| "reward_std": 0.1663391888141632, | |
| "rewards/accuracy_reward": 1.2386190295219421, | |
| "rewards/format_reward": 1.0, | |
| "step": 388 | |
| }, | |
| { | |
| "completion_length": 71.3515625, | |
| "epoch": 4.686746987951807, | |
| "grad_norm": 3.6715987846617737, | |
| "kl": 0.1103515625, | |
| "learning_rate": 2.1887550200803212e-07, | |
| "loss": 0.0044, | |
| "reward": 2.2813053131103516, | |
| "reward_std": 0.20307840406894684, | |
| "rewards/accuracy_reward": 1.2891177535057068, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 389 | |
| }, | |
| { | |
| "completion_length": 67.8671875, | |
| "epoch": 4.698795180722891, | |
| "grad_norm": 4.1990886176906566, | |
| "kl": 0.1181640625, | |
| "learning_rate": 2.1686746987951808e-07, | |
| "loss": 0.0047, | |
| "reward": 2.3316123485565186, | |
| "reward_std": 0.18899912387132645, | |
| "rewards/accuracy_reward": 1.339424967765808, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 390 | |
| }, | |
| { | |
| "completion_length": 73.5390625, | |
| "epoch": 4.710843373493976, | |
| "grad_norm": 4.5848307121684035, | |
| "kl": 0.11767578125, | |
| "learning_rate": 2.14859437751004e-07, | |
| "loss": 0.0047, | |
| "reward": 2.3556346893310547, | |
| "reward_std": 0.17518161982297897, | |
| "rewards/accuracy_reward": 1.3634473085403442, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 391 | |
| }, | |
| { | |
| "completion_length": 73.3828125, | |
| "epoch": 4.72289156626506, | |
| "grad_norm": 4.308895887462787, | |
| "kl": 0.09716796875, | |
| "learning_rate": 2.1285140562248996e-07, | |
| "loss": 0.0039, | |
| "reward": 2.3230199813842773, | |
| "reward_std": 0.2215501293540001, | |
| "rewards/accuracy_reward": 1.3230200409889221, | |
| "rewards/format_reward": 1.0, | |
| "step": 392 | |
| }, | |
| { | |
| "completion_length": 71.625, | |
| "epoch": 4.734939759036145, | |
| "grad_norm": 3.8869195849917335, | |
| "kl": 0.117919921875, | |
| "learning_rate": 2.108433734939759e-07, | |
| "loss": 0.0047, | |
| "reward": 2.311624765396118, | |
| "reward_std": 0.233637273311615, | |
| "rewards/accuracy_reward": 1.3116250038146973, | |
| "rewards/format_reward": 1.0, | |
| "step": 393 | |
| }, | |
| { | |
| "completion_length": 67.828125, | |
| "epoch": 4.746987951807229, | |
| "grad_norm": 4.950759054297939, | |
| "kl": 0.10888671875, | |
| "learning_rate": 2.0883534136546184e-07, | |
| "loss": 0.0044, | |
| "reward": 2.379747152328491, | |
| "reward_std": 0.19298578798770905, | |
| "rewards/accuracy_reward": 1.3797469735145569, | |
| "rewards/format_reward": 1.0, | |
| "step": 394 | |
| }, | |
| { | |
| "completion_length": 72.2578125, | |
| "epoch": 4.759036144578313, | |
| "grad_norm": 45.47765651174386, | |
| "kl": 0.126708984375, | |
| "learning_rate": 2.0682730923694776e-07, | |
| "loss": 0.0051, | |
| "reward": 2.078563928604126, | |
| "reward_std": 0.253988578915596, | |
| "rewards/accuracy_reward": 1.0941888689994812, | |
| "rewards/format_reward": 0.984375, | |
| "step": 395 | |
| }, | |
| { | |
| "completion_length": 71.6484375, | |
| "epoch": 4.771084337349397, | |
| "grad_norm": 6.044646695827286, | |
| "kl": 0.13916015625, | |
| "learning_rate": 2.0481927710843372e-07, | |
| "loss": 0.0056, | |
| "reward": 2.485829472541809, | |
| "reward_std": 0.180104598402977, | |
| "rewards/accuracy_reward": 1.4858292937278748, | |
| "rewards/format_reward": 1.0, | |
| "step": 396 | |
| }, | |
| { | |
| "completion_length": 65.09375, | |
| "epoch": 4.783132530120482, | |
| "grad_norm": 4.360820446081869, | |
| "kl": 0.1416015625, | |
| "learning_rate": 2.0281124497991967e-07, | |
| "loss": 0.0057, | |
| "reward": 2.1638635396957397, | |
| "reward_std": 0.31551285088062286, | |
| "rewards/accuracy_reward": 1.1873010993003845, | |
| "rewards/format_reward": 0.9765625, | |
| "step": 397 | |
| }, | |
| { | |
| "completion_length": 70.6328125, | |
| "epoch": 4.795180722891566, | |
| "grad_norm": 5.234619949658262, | |
| "kl": 0.115966796875, | |
| "learning_rate": 2.0080321285140563e-07, | |
| "loss": 0.0046, | |
| "reward": 2.424190402030945, | |
| "reward_std": 0.23157334327697754, | |
| "rewards/accuracy_reward": 1.4241904616355896, | |
| "rewards/format_reward": 1.0, | |
| "step": 398 | |
| }, | |
| { | |
| "completion_length": 70.4375, | |
| "epoch": 4.807228915662651, | |
| "grad_norm": 5.2543384630783265, | |
| "kl": 0.12060546875, | |
| "learning_rate": 1.9879518072289155e-07, | |
| "loss": 0.0048, | |
| "reward": 2.3333520889282227, | |
| "reward_std": 0.2145429253578186, | |
| "rewards/accuracy_reward": 1.3411647081375122, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 399 | |
| }, | |
| { | |
| "completion_length": 65.421875, | |
| "epoch": 4.8192771084337345, | |
| "grad_norm": 6.050688926597152, | |
| "kl": 0.125732421875, | |
| "learning_rate": 1.967871485943775e-07, | |
| "loss": 0.005, | |
| "reward": 2.412783145904541, | |
| "reward_std": 0.2059781178832054, | |
| "rewards/accuracy_reward": 1.420595645904541, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 400 | |
| }, | |
| { | |
| "completion_length": 63.5546875, | |
| "epoch": 4.831325301204819, | |
| "grad_norm": 4.14350718873446, | |
| "kl": 0.143798828125, | |
| "learning_rate": 1.9477911646586346e-07, | |
| "loss": 0.0057, | |
| "reward": 2.3667309284210205, | |
| "reward_std": 0.1764308363199234, | |
| "rewards/accuracy_reward": 1.3745434284210205, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 401 | |
| }, | |
| { | |
| "completion_length": 71.8671875, | |
| "epoch": 4.843373493975903, | |
| "grad_norm": 4.134424932683493, | |
| "kl": 0.126953125, | |
| "learning_rate": 1.9277108433734939e-07, | |
| "loss": 0.0051, | |
| "reward": 2.2129541635513306, | |
| "reward_std": 0.1565767452120781, | |
| "rewards/accuracy_reward": 1.2129541635513306, | |
| "rewards/format_reward": 1.0, | |
| "step": 402 | |
| }, | |
| { | |
| "completion_length": 64.0390625, | |
| "epoch": 4.855421686746988, | |
| "grad_norm": 4.135875391105592, | |
| "kl": 0.166015625, | |
| "learning_rate": 1.9076305220883534e-07, | |
| "loss": 0.0066, | |
| "reward": 2.3259581327438354, | |
| "reward_std": 0.2349315583705902, | |
| "rewards/accuracy_reward": 1.3259583115577698, | |
| "rewards/format_reward": 1.0, | |
| "step": 403 | |
| }, | |
| { | |
| "completion_length": 66.515625, | |
| "epoch": 4.867469879518072, | |
| "grad_norm": 4.276605246406482, | |
| "kl": 0.138916015625, | |
| "learning_rate": 1.8875502008032127e-07, | |
| "loss": 0.0056, | |
| "reward": 2.306966781616211, | |
| "reward_std": 0.2081274688243866, | |
| "rewards/accuracy_reward": 1.3069666624069214, | |
| "rewards/format_reward": 1.0, | |
| "step": 404 | |
| }, | |
| { | |
| "completion_length": 62.28125, | |
| "epoch": 4.879518072289157, | |
| "grad_norm": 4.594134632277065, | |
| "kl": 0.1826171875, | |
| "learning_rate": 1.8674698795180722e-07, | |
| "loss": 0.0073, | |
| "reward": 2.126552700996399, | |
| "reward_std": 0.255823478102684, | |
| "rewards/accuracy_reward": 1.1421778202056885, | |
| "rewards/format_reward": 0.984375, | |
| "step": 405 | |
| }, | |
| { | |
| "completion_length": 62.3671875, | |
| "epoch": 4.891566265060241, | |
| "grad_norm": 3.568434088807843, | |
| "kl": 0.14013671875, | |
| "learning_rate": 1.8473895582329315e-07, | |
| "loss": 0.0056, | |
| "reward": 2.417848587036133, | |
| "reward_std": 0.22225632518529892, | |
| "rewards/accuracy_reward": 1.4334735870361328, | |
| "rewards/format_reward": 0.984375, | |
| "step": 406 | |
| }, | |
| { | |
| "completion_length": 66.5078125, | |
| "epoch": 4.903614457831325, | |
| "grad_norm": 4.123527789276523, | |
| "kl": 0.10986328125, | |
| "learning_rate": 1.827309236947791e-07, | |
| "loss": 0.0044, | |
| "reward": 2.294624924659729, | |
| "reward_std": 0.19924252480268478, | |
| "rewards/accuracy_reward": 1.3024373650550842, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 407 | |
| }, | |
| { | |
| "completion_length": 66.390625, | |
| "epoch": 4.9156626506024095, | |
| "grad_norm": 3.62978164804241, | |
| "kl": 0.12890625, | |
| "learning_rate": 1.8072289156626505e-07, | |
| "loss": 0.0051, | |
| "reward": 2.543404698371887, | |
| "reward_std": 0.1362360306084156, | |
| "rewards/accuracy_reward": 1.5434046983718872, | |
| "rewards/format_reward": 1.0, | |
| "step": 408 | |
| }, | |
| { | |
| "completion_length": 63.9765625, | |
| "epoch": 4.927710843373494, | |
| "grad_norm": 4.35384844886202, | |
| "kl": 0.12890625, | |
| "learning_rate": 1.78714859437751e-07, | |
| "loss": 0.0052, | |
| "reward": 2.418124198913574, | |
| "reward_std": 0.22236012667417526, | |
| "rewards/accuracy_reward": 1.4337490797042847, | |
| "rewards/format_reward": 0.984375, | |
| "step": 409 | |
| }, | |
| { | |
| "completion_length": 68.90625, | |
| "epoch": 4.9397590361445785, | |
| "grad_norm": 5.014972518639089, | |
| "kl": 0.1103515625, | |
| "learning_rate": 1.7670682730923694e-07, | |
| "loss": 0.0044, | |
| "reward": 2.4006751775741577, | |
| "reward_std": 0.16714774072170258, | |
| "rewards/accuracy_reward": 1.4006752967834473, | |
| "rewards/format_reward": 1.0, | |
| "step": 410 | |
| }, | |
| { | |
| "completion_length": 69.59375, | |
| "epoch": 4.951807228915663, | |
| "grad_norm": 7.696032017895469, | |
| "kl": 0.13916015625, | |
| "learning_rate": 1.746987951807229e-07, | |
| "loss": 0.0056, | |
| "reward": 2.395194172859192, | |
| "reward_std": 0.16039493680000305, | |
| "rewards/accuracy_reward": 1.3951941132545471, | |
| "rewards/format_reward": 1.0, | |
| "step": 411 | |
| }, | |
| { | |
| "completion_length": 70.125, | |
| "epoch": 4.9638554216867465, | |
| "grad_norm": 4.628350833888434, | |
| "kl": 0.149169921875, | |
| "learning_rate": 1.7269076305220884e-07, | |
| "loss": 0.006, | |
| "reward": 2.1348607540130615, | |
| "reward_std": 0.1709538996219635, | |
| "rewards/accuracy_reward": 1.1348606944084167, | |
| "rewards/format_reward": 1.0, | |
| "step": 412 | |
| }, | |
| { | |
| "completion_length": 66.2109375, | |
| "epoch": 4.975903614457831, | |
| "grad_norm": 3.188607704812383, | |
| "kl": 0.12646484375, | |
| "learning_rate": 1.706827309236948e-07, | |
| "loss": 0.0051, | |
| "reward": 2.302504062652588, | |
| "reward_std": 0.2623682767152786, | |
| "rewards/accuracy_reward": 1.3181291222572327, | |
| "rewards/format_reward": 0.984375, | |
| "step": 413 | |
| }, | |
| { | |
| "completion_length": 64.171875, | |
| "epoch": 4.9879518072289155, | |
| "grad_norm": 3.9665667179390773, | |
| "kl": 0.128662109375, | |
| "learning_rate": 1.686746987951807e-07, | |
| "loss": 0.0052, | |
| "reward": 2.4097338914871216, | |
| "reward_std": 0.17293449118733406, | |
| "rewards/accuracy_reward": 1.4097338318824768, | |
| "rewards/format_reward": 1.0, | |
| "step": 414 | |
| }, | |
| { | |
| "completion_length": 77.33333587646484, | |
| "epoch": 5.0, | |
| "grad_norm": 3.313170759959086, | |
| "kl": 0.1083984375, | |
| "learning_rate": 1.6666666666666665e-07, | |
| "loss": 0.004, | |
| "reward": 2.2759520411491394, | |
| "reward_std": 0.1403224766254425, | |
| "rewards/accuracy_reward": 1.2759520411491394, | |
| "rewards/format_reward": 1.0, | |
| "step": 415 | |
| }, | |
| { | |
| "completion_length": 66.3203125, | |
| "epoch": 5.0120481927710845, | |
| "grad_norm": 4.277881132595083, | |
| "kl": 0.14306640625, | |
| "learning_rate": 1.646586345381526e-07, | |
| "loss": 0.0057, | |
| "reward": 2.373741865158081, | |
| "reward_std": 0.20744601637125015, | |
| "rewards/accuracy_reward": 1.3815542459487915, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 416 | |
| }, | |
| { | |
| "completion_length": 66.53125, | |
| "epoch": 5.024096385542169, | |
| "grad_norm": 3.9929439696450575, | |
| "kl": 0.12939453125, | |
| "learning_rate": 1.6265060240963853e-07, | |
| "loss": 0.0052, | |
| "reward": 2.35166335105896, | |
| "reward_std": 0.2503097951412201, | |
| "rewards/accuracy_reward": 1.35166335105896, | |
| "rewards/format_reward": 1.0, | |
| "step": 417 | |
| }, | |
| { | |
| "completion_length": 68.625, | |
| "epoch": 5.036144578313253, | |
| "grad_norm": 4.023924792103433, | |
| "kl": 0.114013671875, | |
| "learning_rate": 1.6064257028112448e-07, | |
| "loss": 0.0046, | |
| "reward": 2.2476612329483032, | |
| "reward_std": 0.185993991792202, | |
| "rewards/accuracy_reward": 1.2554737329483032, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 418 | |
| }, | |
| { | |
| "completion_length": 65.7421875, | |
| "epoch": 5.048192771084337, | |
| "grad_norm": 3.5711137415239618, | |
| "kl": 0.134033203125, | |
| "learning_rate": 1.5863453815261044e-07, | |
| "loss": 0.0054, | |
| "reward": 2.2856324911117554, | |
| "reward_std": 0.14102690666913986, | |
| "rewards/accuracy_reward": 1.2856324911117554, | |
| "rewards/format_reward": 1.0, | |
| "step": 419 | |
| }, | |
| { | |
| "completion_length": 65.1328125, | |
| "epoch": 5.0602409638554215, | |
| "grad_norm": 5.8881280705003505, | |
| "kl": 0.1259765625, | |
| "learning_rate": 1.566265060240964e-07, | |
| "loss": 0.005, | |
| "reward": 2.474275588989258, | |
| "reward_std": 0.2030300498008728, | |
| "rewards/accuracy_reward": 1.474275529384613, | |
| "rewards/format_reward": 1.0, | |
| "step": 420 | |
| }, | |
| { | |
| "completion_length": 59.453125, | |
| "epoch": 5.072289156626506, | |
| "grad_norm": 17.487945694806488, | |
| "kl": 0.1279296875, | |
| "learning_rate": 1.5461847389558232e-07, | |
| "loss": 0.0051, | |
| "reward": 2.468233823776245, | |
| "reward_std": 0.17333931475877762, | |
| "rewards/accuracy_reward": 1.4682338237762451, | |
| "rewards/format_reward": 1.0, | |
| "step": 421 | |
| }, | |
| { | |
| "completion_length": 67.7421875, | |
| "epoch": 5.0843373493975905, | |
| "grad_norm": 4.5642738703913865, | |
| "kl": 0.12646484375, | |
| "learning_rate": 1.5261044176706827e-07, | |
| "loss": 0.0051, | |
| "reward": 2.39510977268219, | |
| "reward_std": 0.1837218478322029, | |
| "rewards/accuracy_reward": 1.3951098918914795, | |
| "rewards/format_reward": 1.0, | |
| "step": 422 | |
| }, | |
| { | |
| "completion_length": 64.515625, | |
| "epoch": 5.096385542168675, | |
| "grad_norm": 7.684070732359071, | |
| "kl": 0.139892578125, | |
| "learning_rate": 1.5060240963855423e-07, | |
| "loss": 0.0056, | |
| "reward": 2.16294264793396, | |
| "reward_std": 0.14895135164260864, | |
| "rewards/accuracy_reward": 1.1707550883293152, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 423 | |
| }, | |
| { | |
| "completion_length": 64.46875, | |
| "epoch": 5.108433734939759, | |
| "grad_norm": 3.930344733874979, | |
| "kl": 0.11669921875, | |
| "learning_rate": 1.4859437751004015e-07, | |
| "loss": 0.0047, | |
| "reward": 2.3980486392974854, | |
| "reward_std": 0.15896277129650116, | |
| "rewards/accuracy_reward": 1.3980485796928406, | |
| "rewards/format_reward": 1.0, | |
| "step": 424 | |
| }, | |
| { | |
| "completion_length": 68.875, | |
| "epoch": 5.120481927710843, | |
| "grad_norm": 6.912033255857147, | |
| "kl": 0.118896484375, | |
| "learning_rate": 1.4658634538152608e-07, | |
| "loss": 0.0048, | |
| "reward": 2.4401201009750366, | |
| "reward_std": 0.18969366699457169, | |
| "rewards/accuracy_reward": 1.440119981765747, | |
| "rewards/format_reward": 1.0, | |
| "step": 425 | |
| }, | |
| { | |
| "completion_length": 65.609375, | |
| "epoch": 5.132530120481928, | |
| "grad_norm": 3.6477005267341163, | |
| "kl": 0.1708984375, | |
| "learning_rate": 1.4457831325301203e-07, | |
| "loss": 0.0068, | |
| "reward": 2.300011992454529, | |
| "reward_std": 0.2104162722826004, | |
| "rewards/accuracy_reward": 1.300011932849884, | |
| "rewards/format_reward": 1.0, | |
| "step": 426 | |
| }, | |
| { | |
| "completion_length": 65.0859375, | |
| "epoch": 5.144578313253012, | |
| "grad_norm": 5.390081007205584, | |
| "kl": 0.12548828125, | |
| "learning_rate": 1.42570281124498e-07, | |
| "loss": 0.005, | |
| "reward": 2.407547354698181, | |
| "reward_std": 0.19479839503765106, | |
| "rewards/accuracy_reward": 1.4075472354888916, | |
| "rewards/format_reward": 1.0, | |
| "step": 427 | |
| }, | |
| { | |
| "completion_length": 65.8046875, | |
| "epoch": 5.156626506024097, | |
| "grad_norm": 5.842696773596783, | |
| "kl": 0.12255859375, | |
| "learning_rate": 1.4056224899598394e-07, | |
| "loss": 0.0049, | |
| "reward": 2.2872836589813232, | |
| "reward_std": 0.2501709461212158, | |
| "rewards/accuracy_reward": 1.2950963973999023, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 428 | |
| }, | |
| { | |
| "completion_length": 67.2890625, | |
| "epoch": 5.168674698795181, | |
| "grad_norm": 3.9373211288360612, | |
| "kl": 0.134765625, | |
| "learning_rate": 1.3855421686746987e-07, | |
| "loss": 0.0054, | |
| "reward": 2.4114162921905518, | |
| "reward_std": 0.22173649817705154, | |
| "rewards/accuracy_reward": 1.419228732585907, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 429 | |
| }, | |
| { | |
| "completion_length": 65.7265625, | |
| "epoch": 5.180722891566265, | |
| "grad_norm": 5.989728831260378, | |
| "kl": 0.20263671875, | |
| "learning_rate": 1.3654618473895582e-07, | |
| "loss": 0.0081, | |
| "reward": 2.349661111831665, | |
| "reward_std": 0.24485966563224792, | |
| "rewards/accuracy_reward": 1.3496609926223755, | |
| "rewards/format_reward": 1.0, | |
| "step": 430 | |
| }, | |
| { | |
| "completion_length": 71.0390625, | |
| "epoch": 5.192771084337349, | |
| "grad_norm": 4.9722233041190425, | |
| "kl": 0.11083984375, | |
| "learning_rate": 1.3453815261044177e-07, | |
| "loss": 0.0044, | |
| "reward": 2.423168659210205, | |
| "reward_std": 0.16536322236061096, | |
| "rewards/accuracy_reward": 1.4231685996055603, | |
| "rewards/format_reward": 1.0, | |
| "step": 431 | |
| }, | |
| { | |
| "completion_length": 66.234375, | |
| "epoch": 5.204819277108434, | |
| "grad_norm": 3.5058259130400162, | |
| "kl": 0.1376953125, | |
| "learning_rate": 1.3253012048192773e-07, | |
| "loss": 0.0055, | |
| "reward": 2.2352651357650757, | |
| "reward_std": 0.18688317388296127, | |
| "rewards/accuracy_reward": 1.2352651357650757, | |
| "rewards/format_reward": 1.0, | |
| "step": 432 | |
| }, | |
| { | |
| "completion_length": 72.8203125, | |
| "epoch": 5.216867469879518, | |
| "grad_norm": 3.8748331360003485, | |
| "kl": 0.130859375, | |
| "learning_rate": 1.3052208835341366e-07, | |
| "loss": 0.0052, | |
| "reward": 2.3151748180389404, | |
| "reward_std": 0.21110112965106964, | |
| "rewards/accuracy_reward": 1.3229871988296509, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 433 | |
| }, | |
| { | |
| "completion_length": 68.8671875, | |
| "epoch": 5.228915662650603, | |
| "grad_norm": 3.985332448415374, | |
| "kl": 0.1220703125, | |
| "learning_rate": 1.2851405622489958e-07, | |
| "loss": 0.0049, | |
| "reward": 2.26615047454834, | |
| "reward_std": 0.20259422063827515, | |
| "rewards/accuracy_reward": 1.2739630937576294, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 434 | |
| }, | |
| { | |
| "completion_length": 64.0234375, | |
| "epoch": 5.240963855421687, | |
| "grad_norm": 4.209088113123041, | |
| "kl": 0.119873046875, | |
| "learning_rate": 1.2650602409638554e-07, | |
| "loss": 0.0048, | |
| "reward": 2.345677137374878, | |
| "reward_std": 0.16655350476503372, | |
| "rewards/accuracy_reward": 1.345677137374878, | |
| "rewards/format_reward": 1.0, | |
| "step": 435 | |
| }, | |
| { | |
| "completion_length": 72.2109375, | |
| "epoch": 5.253012048192771, | |
| "grad_norm": 3.7180924645581994, | |
| "kl": 0.13427734375, | |
| "learning_rate": 1.2449799196787146e-07, | |
| "loss": 0.0054, | |
| "reward": 2.163213849067688, | |
| "reward_std": 0.3149610310792923, | |
| "rewards/accuracy_reward": 1.1866515278816223, | |
| "rewards/format_reward": 0.9765625, | |
| "step": 436 | |
| }, | |
| { | |
| "completion_length": 65.328125, | |
| "epoch": 5.265060240963855, | |
| "grad_norm": 3.8280472693841556, | |
| "kl": 0.12744140625, | |
| "learning_rate": 1.2248995983935742e-07, | |
| "loss": 0.0051, | |
| "reward": 2.3446794748306274, | |
| "reward_std": 0.22430174052715302, | |
| "rewards/accuracy_reward": 1.3446794152259827, | |
| "rewards/format_reward": 1.0, | |
| "step": 437 | |
| }, | |
| { | |
| "completion_length": 64.65625, | |
| "epoch": 5.27710843373494, | |
| "grad_norm": 5.861122122648032, | |
| "kl": 0.12060546875, | |
| "learning_rate": 1.2048192771084337e-07, | |
| "loss": 0.0048, | |
| "reward": 2.379356861114502, | |
| "reward_std": 0.1506607085466385, | |
| "rewards/accuracy_reward": 1.3871691226959229, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 438 | |
| }, | |
| { | |
| "completion_length": 71.1171875, | |
| "epoch": 5.289156626506024, | |
| "grad_norm": 3.8119653679452092, | |
| "kl": 0.12353515625, | |
| "learning_rate": 1.1847389558232931e-07, | |
| "loss": 0.0049, | |
| "reward": 2.388357400894165, | |
| "reward_std": 0.23687779903411865, | |
| "rewards/accuracy_reward": 1.3961697816848755, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 439 | |
| }, | |
| { | |
| "completion_length": 72.3515625, | |
| "epoch": 5.301204819277109, | |
| "grad_norm": 3.9178115284886372, | |
| "kl": 0.095458984375, | |
| "learning_rate": 1.1646586345381526e-07, | |
| "loss": 0.0038, | |
| "reward": 2.6513583660125732, | |
| "reward_std": 0.17830242216587067, | |
| "rewards/accuracy_reward": 1.6513583660125732, | |
| "rewards/format_reward": 1.0, | |
| "step": 440 | |
| }, | |
| { | |
| "completion_length": 68.921875, | |
| "epoch": 5.313253012048193, | |
| "grad_norm": 4.623442869387058, | |
| "kl": 0.100830078125, | |
| "learning_rate": 1.1445783132530119e-07, | |
| "loss": 0.004, | |
| "reward": 2.549654483795166, | |
| "reward_std": 0.16079290956258774, | |
| "rewards/accuracy_reward": 1.5574671030044556, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 441 | |
| }, | |
| { | |
| "completion_length": 71.3203125, | |
| "epoch": 5.325301204819277, | |
| "grad_norm": 5.278895722638805, | |
| "kl": 0.10986328125, | |
| "learning_rate": 1.1244979919678714e-07, | |
| "loss": 0.0044, | |
| "reward": 2.203883409500122, | |
| "reward_std": 0.258064404129982, | |
| "rewards/accuracy_reward": 1.2116957902908325, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 442 | |
| }, | |
| { | |
| "completion_length": 69.515625, | |
| "epoch": 5.337349397590361, | |
| "grad_norm": 4.142710717599773, | |
| "kl": 0.113525390625, | |
| "learning_rate": 1.1044176706827308e-07, | |
| "loss": 0.0045, | |
| "reward": 2.1769516468048096, | |
| "reward_std": 0.275626465678215, | |
| "rewards/accuracy_reward": 1.1769516468048096, | |
| "rewards/format_reward": 1.0, | |
| "step": 443 | |
| }, | |
| { | |
| "completion_length": 68.3203125, | |
| "epoch": 5.349397590361446, | |
| "grad_norm": 4.180078412016221, | |
| "kl": 0.147216796875, | |
| "learning_rate": 1.0843373493975904e-07, | |
| "loss": 0.0059, | |
| "reward": 2.381720542907715, | |
| "reward_std": 0.20287376642227173, | |
| "rewards/accuracy_reward": 1.3817205429077148, | |
| "rewards/format_reward": 1.0, | |
| "step": 444 | |
| }, | |
| { | |
| "completion_length": 69.7421875, | |
| "epoch": 5.36144578313253, | |
| "grad_norm": 3.7523897150785603, | |
| "kl": 0.12939453125, | |
| "learning_rate": 1.0642570281124498e-07, | |
| "loss": 0.0052, | |
| "reward": 2.3669261932373047, | |
| "reward_std": 0.2056456208229065, | |
| "rewards/accuracy_reward": 1.3747385740280151, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 445 | |
| }, | |
| { | |
| "completion_length": 67.7109375, | |
| "epoch": 5.373493975903615, | |
| "grad_norm": 4.924758819089559, | |
| "kl": 0.185546875, | |
| "learning_rate": 1.0441767068273092e-07, | |
| "loss": 0.0074, | |
| "reward": 2.4100332260131836, | |
| "reward_std": 0.22913093864917755, | |
| "rewards/accuracy_reward": 1.4178457260131836, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 446 | |
| }, | |
| { | |
| "completion_length": 69.1875, | |
| "epoch": 5.385542168674699, | |
| "grad_norm": 3.080626056952063, | |
| "kl": 0.122314453125, | |
| "learning_rate": 1.0240963855421686e-07, | |
| "loss": 0.0049, | |
| "reward": 2.3073067665100098, | |
| "reward_std": 0.23586007952690125, | |
| "rewards/accuracy_reward": 1.315119206905365, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 447 | |
| }, | |
| { | |
| "completion_length": 67.59375, | |
| "epoch": 5.397590361445783, | |
| "grad_norm": 3.8573400804993314, | |
| "kl": 0.128662109375, | |
| "learning_rate": 1.0040160642570281e-07, | |
| "loss": 0.0051, | |
| "reward": 2.2195699214935303, | |
| "reward_std": 0.18059836328029633, | |
| "rewards/accuracy_reward": 1.2195698618888855, | |
| "rewards/format_reward": 1.0, | |
| "step": 448 | |
| }, | |
| { | |
| "completion_length": 65.0078125, | |
| "epoch": 5.409638554216867, | |
| "grad_norm": 9.729377045307634, | |
| "kl": 0.110107421875, | |
| "learning_rate": 9.839357429718875e-08, | |
| "loss": 0.0044, | |
| "reward": 2.335146427154541, | |
| "reward_std": 0.20962534099817276, | |
| "rewards/accuracy_reward": 1.3429590463638306, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 449 | |
| }, | |
| { | |
| "completion_length": 76.171875, | |
| "epoch": 5.421686746987952, | |
| "grad_norm": 5.139417091846479, | |
| "kl": 0.17626953125, | |
| "learning_rate": 9.638554216867469e-08, | |
| "loss": 0.0071, | |
| "reward": 2.2514326572418213, | |
| "reward_std": 0.18450473248958588, | |
| "rewards/accuracy_reward": 1.2592450976371765, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 450 | |
| }, | |
| { | |
| "completion_length": 68.046875, | |
| "epoch": 5.433734939759036, | |
| "grad_norm": 3.961385062957452, | |
| "kl": 0.10693359375, | |
| "learning_rate": 9.437751004016063e-08, | |
| "loss": 0.0043, | |
| "reward": 2.328533172607422, | |
| "reward_std": 0.18290965259075165, | |
| "rewards/accuracy_reward": 1.3285331726074219, | |
| "rewards/format_reward": 1.0, | |
| "step": 451 | |
| }, | |
| { | |
| "completion_length": 68.6953125, | |
| "epoch": 5.445783132530121, | |
| "grad_norm": 4.887519681333338, | |
| "kl": 0.103759765625, | |
| "learning_rate": 9.236947791164657e-08, | |
| "loss": 0.0042, | |
| "reward": 2.3144426345825195, | |
| "reward_std": 0.21034369617700577, | |
| "rewards/accuracy_reward": 1.3144426941871643, | |
| "rewards/format_reward": 1.0, | |
| "step": 452 | |
| }, | |
| { | |
| "completion_length": 68.0, | |
| "epoch": 5.457831325301205, | |
| "grad_norm": 3.80893967356862, | |
| "kl": 0.127685546875, | |
| "learning_rate": 9.036144578313253e-08, | |
| "loss": 0.0051, | |
| "reward": 2.4345412254333496, | |
| "reward_std": 0.2006332352757454, | |
| "rewards/accuracy_reward": 1.4345412254333496, | |
| "rewards/format_reward": 1.0, | |
| "step": 453 | |
| }, | |
| { | |
| "completion_length": 67.046875, | |
| "epoch": 5.469879518072289, | |
| "grad_norm": 4.2954066473287815, | |
| "kl": 0.12841796875, | |
| "learning_rate": 8.835341365461847e-08, | |
| "loss": 0.0052, | |
| "reward": 2.353352427482605, | |
| "reward_std": 0.22566306591033936, | |
| "rewards/accuracy_reward": 1.353352427482605, | |
| "rewards/format_reward": 1.0, | |
| "step": 454 | |
| }, | |
| { | |
| "completion_length": 64.8984375, | |
| "epoch": 5.481927710843373, | |
| "grad_norm": 4.546803918905019, | |
| "kl": 0.1337890625, | |
| "learning_rate": 8.634538152610442e-08, | |
| "loss": 0.0054, | |
| "reward": 2.3113902807235718, | |
| "reward_std": 0.20004340261220932, | |
| "rewards/accuracy_reward": 1.3192027807235718, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 455 | |
| }, | |
| { | |
| "completion_length": 66.1640625, | |
| "epoch": 5.493975903614458, | |
| "grad_norm": 3.5466190382737883, | |
| "kl": 0.123046875, | |
| "learning_rate": 8.433734939759035e-08, | |
| "loss": 0.0049, | |
| "reward": 2.3270002603530884, | |
| "reward_std": 0.21506989747285843, | |
| "rewards/accuracy_reward": 1.3270001411437988, | |
| "rewards/format_reward": 1.0, | |
| "step": 456 | |
| }, | |
| { | |
| "completion_length": 72.3984375, | |
| "epoch": 5.506024096385542, | |
| "grad_norm": 5.213818604387868, | |
| "kl": 0.1328125, | |
| "learning_rate": 8.23293172690763e-08, | |
| "loss": 0.0053, | |
| "reward": 2.4117329120635986, | |
| "reward_std": 0.21075783669948578, | |
| "rewards/accuracy_reward": 1.411732792854309, | |
| "rewards/format_reward": 1.0, | |
| "step": 457 | |
| }, | |
| { | |
| "completion_length": 63.4140625, | |
| "epoch": 5.518072289156627, | |
| "grad_norm": 4.087135154378612, | |
| "kl": 0.1142578125, | |
| "learning_rate": 8.032128514056224e-08, | |
| "loss": 0.0046, | |
| "reward": 2.2361518144607544, | |
| "reward_std": 0.15534771978855133, | |
| "rewards/accuracy_reward": 1.2361518740653992, | |
| "rewards/format_reward": 1.0, | |
| "step": 458 | |
| }, | |
| { | |
| "completion_length": 66.6796875, | |
| "epoch": 5.530120481927711, | |
| "grad_norm": 3.8509871084036083, | |
| "kl": 0.12255859375, | |
| "learning_rate": 7.83132530120482e-08, | |
| "loss": 0.0049, | |
| "reward": 2.402904510498047, | |
| "reward_std": 0.18761365860700607, | |
| "rewards/accuracy_reward": 1.4029043912887573, | |
| "rewards/format_reward": 1.0, | |
| "step": 459 | |
| }, | |
| { | |
| "completion_length": 67.921875, | |
| "epoch": 5.542168674698795, | |
| "grad_norm": 3.8868143152174714, | |
| "kl": 0.1201171875, | |
| "learning_rate": 7.630522088353414e-08, | |
| "loss": 0.0048, | |
| "reward": 2.202209234237671, | |
| "reward_std": 0.20886321365833282, | |
| "rewards/accuracy_reward": 1.2022093534469604, | |
| "rewards/format_reward": 1.0, | |
| "step": 460 | |
| }, | |
| { | |
| "completion_length": 69.84375, | |
| "epoch": 5.554216867469879, | |
| "grad_norm": 9.828452094441177, | |
| "kl": 0.138427734375, | |
| "learning_rate": 7.429718875502008e-08, | |
| "loss": 0.0055, | |
| "reward": 2.255289673805237, | |
| "reward_std": 0.3091956526041031, | |
| "rewards/accuracy_reward": 1.2787271738052368, | |
| "rewards/format_reward": 0.9765625, | |
| "step": 461 | |
| }, | |
| { | |
| "completion_length": 67.7265625, | |
| "epoch": 5.566265060240964, | |
| "grad_norm": 3.5884325923981777, | |
| "kl": 0.14501953125, | |
| "learning_rate": 7.228915662650602e-08, | |
| "loss": 0.0058, | |
| "reward": 2.389763116836548, | |
| "reward_std": 0.1989041194319725, | |
| "rewards/accuracy_reward": 1.3897631168365479, | |
| "rewards/format_reward": 1.0, | |
| "step": 462 | |
| }, | |
| { | |
| "completion_length": 63.4765625, | |
| "epoch": 5.578313253012048, | |
| "grad_norm": 3.943165256338966, | |
| "kl": 0.15185546875, | |
| "learning_rate": 7.028112449799197e-08, | |
| "loss": 0.0061, | |
| "reward": 2.2263519763946533, | |
| "reward_std": 0.22419632971286774, | |
| "rewards/accuracy_reward": 1.2341644763946533, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 463 | |
| }, | |
| { | |
| "completion_length": 67.734375, | |
| "epoch": 5.590361445783133, | |
| "grad_norm": 8.892123036444877, | |
| "kl": 0.126953125, | |
| "learning_rate": 6.827309236947791e-08, | |
| "loss": 0.0051, | |
| "reward": 2.3126423358917236, | |
| "reward_std": 0.17722339183092117, | |
| "rewards/accuracy_reward": 1.3126422762870789, | |
| "rewards/format_reward": 1.0, | |
| "step": 464 | |
| }, | |
| { | |
| "completion_length": 75.5546875, | |
| "epoch": 5.602409638554217, | |
| "grad_norm": 4.229071556328315, | |
| "kl": 0.1240234375, | |
| "learning_rate": 6.626506024096386e-08, | |
| "loss": 0.005, | |
| "reward": 2.2280049324035645, | |
| "reward_std": 0.22474994510412216, | |
| "rewards/accuracy_reward": 1.235817551612854, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 465 | |
| }, | |
| { | |
| "completion_length": 66.9609375, | |
| "epoch": 5.614457831325301, | |
| "grad_norm": 4.577684554062664, | |
| "kl": 0.12451171875, | |
| "learning_rate": 6.425702811244979e-08, | |
| "loss": 0.005, | |
| "reward": 2.2235909700393677, | |
| "reward_std": 0.22441789507865906, | |
| "rewards/accuracy_reward": 1.2392158508300781, | |
| "rewards/format_reward": 0.984375, | |
| "step": 466 | |
| }, | |
| { | |
| "completion_length": 70.4375, | |
| "epoch": 5.626506024096385, | |
| "grad_norm": 4.349159327486559, | |
| "kl": 0.112548828125, | |
| "learning_rate": 6.224899598393573e-08, | |
| "loss": 0.0045, | |
| "reward": 2.3591808080673218, | |
| "reward_std": 0.1966349333524704, | |
| "rewards/accuracy_reward": 1.3669933080673218, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 467 | |
| }, | |
| { | |
| "completion_length": 69.4453125, | |
| "epoch": 5.63855421686747, | |
| "grad_norm": 3.0423100870405437, | |
| "kl": 0.138671875, | |
| "learning_rate": 6.024096385542168e-08, | |
| "loss": 0.0055, | |
| "reward": 2.4168301820755005, | |
| "reward_std": 0.23313428461551666, | |
| "rewards/accuracy_reward": 1.4246427416801453, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 468 | |
| }, | |
| { | |
| "completion_length": 67.9453125, | |
| "epoch": 5.650602409638554, | |
| "grad_norm": 4.8492295392656075, | |
| "kl": 0.124755859375, | |
| "learning_rate": 5.823293172690763e-08, | |
| "loss": 0.005, | |
| "reward": 2.3264076709747314, | |
| "reward_std": 0.18676774948835373, | |
| "rewards/accuracy_reward": 1.3264076709747314, | |
| "rewards/format_reward": 1.0, | |
| "step": 469 | |
| }, | |
| { | |
| "completion_length": 68.3984375, | |
| "epoch": 5.662650602409639, | |
| "grad_norm": 3.7143887896006706, | |
| "kl": 0.118896484375, | |
| "learning_rate": 5.622489959839357e-08, | |
| "loss": 0.0048, | |
| "reward": 2.275146722793579, | |
| "reward_std": 0.23441863059997559, | |
| "rewards/accuracy_reward": 1.2907716631889343, | |
| "rewards/format_reward": 0.984375, | |
| "step": 470 | |
| }, | |
| { | |
| "completion_length": 69.703125, | |
| "epoch": 5.674698795180722, | |
| "grad_norm": 6.421818895030251, | |
| "kl": 0.105712890625, | |
| "learning_rate": 5.421686746987952e-08, | |
| "loss": 0.0042, | |
| "reward": 2.3713172674179077, | |
| "reward_std": 0.17046835273504257, | |
| "rewards/accuracy_reward": 1.3713172674179077, | |
| "rewards/format_reward": 1.0, | |
| "step": 471 | |
| }, | |
| { | |
| "completion_length": 71.7578125, | |
| "epoch": 5.686746987951807, | |
| "grad_norm": 3.7429303333646846, | |
| "kl": 0.17333984375, | |
| "learning_rate": 5.220883534136546e-08, | |
| "loss": 0.0069, | |
| "reward": 2.21248197555542, | |
| "reward_std": 0.1897253841161728, | |
| "rewards/accuracy_reward": 1.2202943563461304, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 472 | |
| }, | |
| { | |
| "completion_length": 66.0625, | |
| "epoch": 5.698795180722891, | |
| "grad_norm": 4.6125292648898375, | |
| "kl": 0.1171875, | |
| "learning_rate": 5.0200803212851406e-08, | |
| "loss": 0.0047, | |
| "reward": 2.3862085342407227, | |
| "reward_std": 0.14106625318527222, | |
| "rewards/accuracy_reward": 1.3940210938453674, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 473 | |
| }, | |
| { | |
| "completion_length": 71.4296875, | |
| "epoch": 5.710843373493976, | |
| "grad_norm": 4.192704287374918, | |
| "kl": 0.108642578125, | |
| "learning_rate": 4.8192771084337347e-08, | |
| "loss": 0.0043, | |
| "reward": 2.3476767539978027, | |
| "reward_std": 0.20362288504838943, | |
| "rewards/accuracy_reward": 1.3476767539978027, | |
| "rewards/format_reward": 1.0, | |
| "step": 474 | |
| }, | |
| { | |
| "completion_length": 67.2109375, | |
| "epoch": 5.72289156626506, | |
| "grad_norm": 4.1447657242460645, | |
| "kl": 0.1298828125, | |
| "learning_rate": 4.618473895582329e-08, | |
| "loss": 0.0052, | |
| "reward": 2.266420602798462, | |
| "reward_std": 0.2129717692732811, | |
| "rewards/accuracy_reward": 1.2664207220077515, | |
| "rewards/format_reward": 1.0, | |
| "step": 475 | |
| }, | |
| { | |
| "completion_length": 66.546875, | |
| "epoch": 5.734939759036145, | |
| "grad_norm": 3.4345215566799574, | |
| "kl": 0.106201171875, | |
| "learning_rate": 4.4176706827309234e-08, | |
| "loss": 0.0042, | |
| "reward": 2.352730870246887, | |
| "reward_std": 0.1454787813127041, | |
| "rewards/accuracy_reward": 1.3605434894561768, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 476 | |
| }, | |
| { | |
| "completion_length": 71.828125, | |
| "epoch": 5.746987951807229, | |
| "grad_norm": 4.187659893839478, | |
| "kl": 0.111328125, | |
| "learning_rate": 4.2168674698795174e-08, | |
| "loss": 0.0045, | |
| "reward": 2.2670211791992188, | |
| "reward_std": 0.22116923332214355, | |
| "rewards/accuracy_reward": 1.267021119594574, | |
| "rewards/format_reward": 1.0, | |
| "step": 477 | |
| }, | |
| { | |
| "completion_length": 69.1875, | |
| "epoch": 5.759036144578313, | |
| "grad_norm": 3.8623536023281617, | |
| "kl": 0.114013671875, | |
| "learning_rate": 4.016064257028112e-08, | |
| "loss": 0.0046, | |
| "reward": 2.222132921218872, | |
| "reward_std": 0.23479964584112167, | |
| "rewards/accuracy_reward": 1.2221328020095825, | |
| "rewards/format_reward": 1.0, | |
| "step": 478 | |
| }, | |
| { | |
| "completion_length": 70.9296875, | |
| "epoch": 5.771084337349397, | |
| "grad_norm": 4.262446208684037, | |
| "kl": 0.09375, | |
| "learning_rate": 3.815261044176707e-08, | |
| "loss": 0.0037, | |
| "reward": 2.2334243059158325, | |
| "reward_std": 0.21778832376003265, | |
| "rewards/accuracy_reward": 1.2334243059158325, | |
| "rewards/format_reward": 1.0, | |
| "step": 479 | |
| }, | |
| { | |
| "completion_length": 68.2421875, | |
| "epoch": 5.783132530120482, | |
| "grad_norm": 3.475197673617196, | |
| "kl": 0.10595703125, | |
| "learning_rate": 3.614457831325301e-08, | |
| "loss": 0.0042, | |
| "reward": 2.4461944103240967, | |
| "reward_std": 0.21106188744306564, | |
| "rewards/accuracy_reward": 1.4540069103240967, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 480 | |
| }, | |
| { | |
| "completion_length": 70.3671875, | |
| "epoch": 5.795180722891566, | |
| "grad_norm": 4.56883704942929, | |
| "kl": 0.11865234375, | |
| "learning_rate": 3.4136546184738955e-08, | |
| "loss": 0.0047, | |
| "reward": 2.441108226776123, | |
| "reward_std": 0.2091435343027115, | |
| "rewards/accuracy_reward": 1.441108226776123, | |
| "rewards/format_reward": 1.0, | |
| "step": 481 | |
| }, | |
| { | |
| "completion_length": 69.171875, | |
| "epoch": 5.807228915662651, | |
| "grad_norm": 3.959761896565078, | |
| "kl": 0.12451171875, | |
| "learning_rate": 3.2128514056224896e-08, | |
| "loss": 0.005, | |
| "reward": 2.3847368955612183, | |
| "reward_std": 0.14646587148308754, | |
| "rewards/accuracy_reward": 1.3847368359565735, | |
| "rewards/format_reward": 1.0, | |
| "step": 482 | |
| }, | |
| { | |
| "completion_length": 75.3125, | |
| "epoch": 5.8192771084337345, | |
| "grad_norm": 4.6238410926161855, | |
| "kl": 0.108642578125, | |
| "learning_rate": 3.012048192771084e-08, | |
| "loss": 0.0043, | |
| "reward": 2.2356351613998413, | |
| "reward_std": 0.3032216280698776, | |
| "rewards/accuracy_reward": 1.2434476613998413, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 483 | |
| }, | |
| { | |
| "completion_length": 70.921875, | |
| "epoch": 5.831325301204819, | |
| "grad_norm": 4.963499305554948, | |
| "kl": 0.082275390625, | |
| "learning_rate": 2.8112449799196786e-08, | |
| "loss": 0.0033, | |
| "reward": 2.3230150938034058, | |
| "reward_std": 0.16892920434474945, | |
| "rewards/accuracy_reward": 1.3230149745941162, | |
| "rewards/format_reward": 1.0, | |
| "step": 484 | |
| }, | |
| { | |
| "completion_length": 69.3359375, | |
| "epoch": 5.843373493975903, | |
| "grad_norm": 4.069771837808966, | |
| "kl": 0.1396484375, | |
| "learning_rate": 2.610441767068273e-08, | |
| "loss": 0.0056, | |
| "reward": 2.327863335609436, | |
| "reward_std": 0.23238816112279892, | |
| "rewards/accuracy_reward": 1.3434883952140808, | |
| "rewards/format_reward": 0.984375, | |
| "step": 485 | |
| }, | |
| { | |
| "completion_length": 68.875, | |
| "epoch": 5.855421686746988, | |
| "grad_norm": 4.471391988945464, | |
| "kl": 0.13330078125, | |
| "learning_rate": 2.4096385542168673e-08, | |
| "loss": 0.0053, | |
| "reward": 2.331111192703247, | |
| "reward_std": 0.1987084299325943, | |
| "rewards/accuracy_reward": 1.3389237523078918, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 486 | |
| }, | |
| { | |
| "completion_length": 72.2734375, | |
| "epoch": 5.867469879518072, | |
| "grad_norm": 4.3661266337784514, | |
| "kl": 0.128173828125, | |
| "learning_rate": 2.2088353413654617e-08, | |
| "loss": 0.0051, | |
| "reward": 2.2740135192871094, | |
| "reward_std": 0.17679665982723236, | |
| "rewards/accuracy_reward": 1.2740132808685303, | |
| "rewards/format_reward": 1.0, | |
| "step": 487 | |
| }, | |
| { | |
| "completion_length": 69.328125, | |
| "epoch": 5.879518072289157, | |
| "grad_norm": 4.78815312664634, | |
| "kl": 0.150634765625, | |
| "learning_rate": 2.008032128514056e-08, | |
| "loss": 0.006, | |
| "reward": 2.2422866821289062, | |
| "reward_std": 0.23693696409463882, | |
| "rewards/accuracy_reward": 1.2422866821289062, | |
| "rewards/format_reward": 1.0, | |
| "step": 488 | |
| }, | |
| { | |
| "completion_length": 71.4140625, | |
| "epoch": 5.891566265060241, | |
| "grad_norm": 6.245102077972556, | |
| "kl": 0.121826171875, | |
| "learning_rate": 1.8072289156626504e-08, | |
| "loss": 0.0049, | |
| "reward": 2.315194010734558, | |
| "reward_std": 0.1885218769311905, | |
| "rewards/accuracy_reward": 1.3230066299438477, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 489 | |
| }, | |
| { | |
| "completion_length": 63.8984375, | |
| "epoch": 5.903614457831325, | |
| "grad_norm": 4.510763484461414, | |
| "kl": 0.122314453125, | |
| "learning_rate": 1.6064257028112448e-08, | |
| "loss": 0.0049, | |
| "reward": 2.3149102926254272, | |
| "reward_std": 0.1639706939458847, | |
| "rewards/accuracy_reward": 1.3149102926254272, | |
| "rewards/format_reward": 1.0, | |
| "step": 490 | |
| }, | |
| { | |
| "completion_length": 66.0, | |
| "epoch": 5.9156626506024095, | |
| "grad_norm": 4.091329557372317, | |
| "kl": 0.1435546875, | |
| "learning_rate": 1.4056224899598393e-08, | |
| "loss": 0.0058, | |
| "reward": 2.4370064735412598, | |
| "reward_std": 0.15971215814352036, | |
| "rewards/accuracy_reward": 1.4370064735412598, | |
| "rewards/format_reward": 1.0, | |
| "step": 491 | |
| }, | |
| { | |
| "completion_length": 70.484375, | |
| "epoch": 5.927710843373494, | |
| "grad_norm": 4.3856574896033305, | |
| "kl": 0.155029296875, | |
| "learning_rate": 1.2048192771084337e-08, | |
| "loss": 0.0062, | |
| "reward": 2.351839542388916, | |
| "reward_std": 0.2616487815976143, | |
| "rewards/accuracy_reward": 1.359652042388916, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 492 | |
| }, | |
| { | |
| "completion_length": 74.171875, | |
| "epoch": 5.9397590361445785, | |
| "grad_norm": 3.3373281083458974, | |
| "kl": 0.107177734375, | |
| "learning_rate": 1.004016064257028e-08, | |
| "loss": 0.0043, | |
| "reward": 2.3034894466400146, | |
| "reward_std": 0.12144535779953003, | |
| "rewards/accuracy_reward": 1.3113019466400146, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 493 | |
| }, | |
| { | |
| "completion_length": 72.8515625, | |
| "epoch": 5.951807228915663, | |
| "grad_norm": 3.3157754210190773, | |
| "kl": 0.097412109375, | |
| "learning_rate": 8.032128514056224e-09, | |
| "loss": 0.0039, | |
| "reward": 2.421133041381836, | |
| "reward_std": 0.16620434820652008, | |
| "rewards/accuracy_reward": 1.421133041381836, | |
| "rewards/format_reward": 1.0, | |
| "step": 494 | |
| }, | |
| { | |
| "completion_length": 76.1328125, | |
| "epoch": 5.9638554216867465, | |
| "grad_norm": 3.788575194538334, | |
| "kl": 0.12158203125, | |
| "learning_rate": 6.024096385542168e-09, | |
| "loss": 0.0049, | |
| "reward": 2.3588104248046875, | |
| "reward_std": 0.1766229048371315, | |
| "rewards/accuracy_reward": 1.358810544013977, | |
| "rewards/format_reward": 1.0, | |
| "step": 495 | |
| }, | |
| { | |
| "completion_length": 71.515625, | |
| "epoch": 5.975903614457831, | |
| "grad_norm": 4.2730966058785835, | |
| "kl": 0.11962890625, | |
| "learning_rate": 4.016064257028112e-09, | |
| "loss": 0.0048, | |
| "reward": 2.3155951499938965, | |
| "reward_std": 0.25304850190877914, | |
| "rewards/accuracy_reward": 1.3234076499938965, | |
| "rewards/format_reward": 0.9921875, | |
| "step": 496 | |
| }, | |
| { | |
| "completion_length": 68.859375, | |
| "epoch": 5.9879518072289155, | |
| "grad_norm": 4.371956801820215, | |
| "kl": 0.119140625, | |
| "learning_rate": 2.008032128514056e-09, | |
| "loss": 0.0048, | |
| "reward": 2.3737374544143677, | |
| "reward_std": 0.20605729520320892, | |
| "rewards/accuracy_reward": 1.373737394809723, | |
| "rewards/format_reward": 1.0, | |
| "step": 497 | |
| }, | |
| { | |
| "completion_length": 60.75000190734863, | |
| "epoch": 6.0, | |
| "grad_norm": 3.9720317304626964, | |
| "kl": 0.1171875, | |
| "learning_rate": 0.0, | |
| "loss": 0.0046, | |
| "reward": 2.4247955083847046, | |
| "reward_std": 0.17968511581420898, | |
| "rewards/accuracy_reward": 1.4247953295707703, | |
| "rewards/format_reward": 1.0, | |
| "step": 498 | |
| } | |
| ], | |
| "logging_steps": 1.0, | |
| "max_steps": 498, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 6, | |
| "save_steps": 50, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": true | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |