{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0903426791277258, "eval_steps": 500, "global_step": 700, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2517.0, "completions/max_terminated_length": 2517.0, "completions/mean_length": 1512.65478515625, "completions/mean_terminated_length": 1512.65478515625, "completions/min_length": 940.0, "completions/min_terminated_length": 940.0, "epoch": 0.001557632398753894, "grad_norm": 0.602738082408905, "kl": -8.884206703640984e-10, "learning_rate": 0.0, "loss": 0.02, "num_tokens": 133045.0, "reward": 1.3617119789123535, "reward_std": 0.09446237236261368, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.36171185970306396, "rewards/correct_reward_func/std": 0.15946270525455475, "step": 1 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.023809523809523836, "completions/max_length": 8192.0, "completions/max_terminated_length": 2520.0, "completions/mean_length": 1677.65478515625, "completions/mean_terminated_length": 1518.7681884765625, "completions/min_length": 690.0, "completions/min_terminated_length": 690.0, "epoch": 0.003115264797507788, "grad_norm": 0.5372695922851562, "kl": -8.036803722522023e-10, "learning_rate": 2e-07, "loss": 0.0986, "num_tokens": 279938.0, "reward": 1.3327711820602417, "reward_std": 0.11337035149335861, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.3327711820602417, "rewards/correct_reward_func/std": 0.14508673548698425, "step": 2 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 1559.9881591796875, "completions/mean_terminated_length": 1480.084228515625, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.004672897196261682, "grad_norm": 0.5770987868309021, "kl": 0.0008140590216498822, "learning_rate": 4e-07, "loss": 0.0348, "num_tokens": 417181.0, "reward": 1.3511351346969604, "reward_std": 0.12009123712778091, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.35113492608070374, "rewards/correct_reward_func/std": 0.16792196035385132, "step": 3 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2638.0, "completions/max_terminated_length": 2638.0, "completions/mean_length": 1605.8929443359375, "completions/mean_terminated_length": 1605.8929443359375, "completions/min_length": 932.0, "completions/min_terminated_length": 932.0, "epoch": 0.006230529595015576, "grad_norm": 0.5473058223724365, "kl": 0.0007717256958130747, "learning_rate": 6e-07, "loss": 0.0022, "num_tokens": 557962.0, "reward": 1.3706098794937134, "reward_std": 0.13414135575294495, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.37060973048210144, "rewards/correct_reward_func/std": 0.1871974617242813, "step": 4 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2486.0, "completions/mean_length": 1513.59521484375, "completions/mean_terminated_length": 1433.1324462890625, "completions/min_length": 314.0, "completions/min_terminated_length": 314.0, "epoch": 0.00778816199376947, "grad_norm": 0.5818154811859131, "kl": 0.0007767349597997963, "learning_rate": 8e-07, "loss": 0.0841, "num_tokens": 690954.0, "reward": 1.2946425676345825, "reward_std": 0.20980872213840485, "rewards/contains_chinese/mean": 0.9523809552192688, "rewards/contains_chinese/std": 0.21423791348934174, "rewards/correct_reward_func/mean": 0.342261403799057, "rewards/correct_reward_func/std": 0.15122981369495392, "step": 5 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2486.0, "completions/max_terminated_length": 2486.0, "completions/mean_length": 1510.3214111328125, "completions/mean_terminated_length": 1510.3214111328125, "completions/min_length": 772.0, "completions/min_terminated_length": 772.0, "epoch": 0.009345794392523364, "grad_norm": 0.5755687355995178, "kl": 0.0008145314350258559, "learning_rate": 1e-06, "loss": 0.0469, "num_tokens": 823905.0, "reward": 1.4106093645095825, "reward_std": 0.12289178371429443, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.42251405119895935, "rewards/correct_reward_func/std": 0.1738594025373459, "step": 6 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2110.0, "completions/max_terminated_length": 2110.0, "completions/mean_length": 1465.047607421875, "completions/mean_terminated_length": 1465.047607421875, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.010903426791277258, "grad_norm": 0.6340458393096924, "kl": 0.0008423295512329787, "learning_rate": 1.2e-06, "loss": -0.0223, "num_tokens": 952801.0, "reward": 1.3219488859176636, "reward_std": 0.14029237627983093, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.3338535726070404, "rewards/correct_reward_func/std": 0.13986904919147491, "step": 7 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2268.0, "completions/max_terminated_length": 2268.0, "completions/mean_length": 1477.011962890625, "completions/mean_terminated_length": 1477.011962890625, "completions/min_length": 803.0, "completions/min_terminated_length": 803.0, "epoch": 0.012461059190031152, "grad_norm": 0.5733258128166199, "kl": 0.0008218956645578146, "learning_rate": 1.4e-06, "loss": -0.0078, "num_tokens": 1082942.0, "reward": 1.3452098369598389, "reward_std": 0.09600555151700974, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.3452097773551941, "rewards/correct_reward_func/std": 0.13662667572498322, "step": 8 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2478.0, "completions/max_terminated_length": 2478.0, "completions/mean_length": 1486.7261962890625, "completions/mean_terminated_length": 1486.7261962890625, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.014018691588785047, "grad_norm": 0.5680725574493408, "kl": 0.0008863781113177538, "learning_rate": 1.6e-06, "loss": -0.0153, "num_tokens": 1213953.0, "reward": 1.3956345319747925, "reward_std": 0.13600240647792816, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.39563441276550293, "rewards/correct_reward_func/std": 0.183233380317688, "step": 9 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2198.0, "completions/max_terminated_length": 2198.0, "completions/mean_length": 1473.6429443359375, "completions/mean_terminated_length": 1473.6429443359375, "completions/min_length": 778.0, "completions/min_terminated_length": 778.0, "epoch": 0.01557632398753894, "grad_norm": 0.5650473833084106, "kl": 0.0009199154155794531, "learning_rate": 1.8e-06, "loss": 0.0136, "num_tokens": 1343673.0, "reward": 1.3652774095535278, "reward_std": 0.0817384421825409, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.36527732014656067, "rewards/correct_reward_func/std": 0.14138561487197876, "step": 10 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.023809523809523836, "completions/max_length": 8192.0, "completions/max_terminated_length": 2230.0, "completions/mean_length": 1687.2857666015625, "completions/mean_terminated_length": 1528.634033203125, "completions/min_length": 917.0, "completions/min_terminated_length": 917.0, "epoch": 0.017133956386292833, "grad_norm": 0.5159066915512085, "kl": 0.0009192087745759636, "learning_rate": 2e-06, "loss": 0.0883, "num_tokens": 1491351.0, "reward": 1.4014300107955933, "reward_std": 0.18983161449432373, "rewards/contains_chinese/mean": 0.9523809552192688, "rewards/contains_chinese/std": 0.21423791348934174, "rewards/correct_reward_func/mean": 0.44904908537864685, "rewards/correct_reward_func/std": 0.16164183616638184, "step": 11 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2740.0, "completions/max_terminated_length": 2740.0, "completions/mean_length": 1522.107177734375, "completions/mean_terminated_length": 1522.107177734375, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.018691588785046728, "grad_norm": 0.5924640893936157, "kl": 0.0011004244443029165, "learning_rate": 1.999375e-06, "loss": -0.0201, "num_tokens": 1625304.0, "reward": 1.3764175176620483, "reward_std": 0.11236605048179626, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.388322114944458, "rewards/correct_reward_func/std": 0.14457714557647705, "step": 12 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.023809523809523836, "completions/max_length": 8192.0, "completions/max_terminated_length": 2378.0, "completions/mean_length": 1624.9285888671875, "completions/mean_terminated_length": 1464.756103515625, "completions/min_length": 693.0, "completions/min_terminated_length": 693.0, "epoch": 0.020249221183800622, "grad_norm": 0.5270693302154541, "kl": 0.0012032188242301345, "learning_rate": 1.99875e-06, "loss": 0.0802, "num_tokens": 1767936.0, "reward": 1.3871831893920898, "reward_std": 0.13983462750911713, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.3990878164768219, "rewards/correct_reward_func/std": 0.1606336236000061, "step": 13 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2426.0, "completions/max_terminated_length": 2426.0, "completions/mean_length": 1479.9761962890625, "completions/mean_terminated_length": 1479.9761962890625, "completions/min_length": 842.0, "completions/min_terminated_length": 842.0, "epoch": 0.021806853582554516, "grad_norm": 0.6190818548202515, "kl": 0.0014357012696564198, "learning_rate": 1.998125e-06, "loss": -0.0216, "num_tokens": 1898164.0, "reward": 1.3895180225372314, "reward_std": 0.08238209784030914, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.38951802253723145, "rewards/correct_reward_func/std": 0.11832733452320099, "step": 14 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2150.0, "completions/mean_length": 1475.0, "completions/mean_terminated_length": 1394.072265625, "completions/min_length": 884.0, "completions/min_terminated_length": 884.0, "epoch": 0.02336448598130841, "grad_norm": 0.6059337258338928, "kl": 0.0016431952244602144, "learning_rate": 1.9975e-06, "loss": 0.0526, "num_tokens": 2027914.0, "reward": 1.3913025856018066, "reward_std": 0.1798313409090042, "rewards/contains_chinese/mean": 0.9642857313156128, "rewards/contains_chinese/std": 0.18669146299362183, "rewards/correct_reward_func/mean": 0.4270167648792267, "rewards/correct_reward_func/std": 0.15607501566410065, "step": 15 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1945.0, "completions/max_terminated_length": 1945.0, "completions/mean_length": 1430.6785888671875, "completions/mean_terminated_length": 1430.6785888671875, "completions/min_length": 888.0, "completions/min_terminated_length": 888.0, "epoch": 0.024922118380062305, "grad_norm": 0.5683785676956177, "kl": 0.0018854692461900413, "learning_rate": 1.996875e-06, "loss": 0.0171, "num_tokens": 2153959.0, "reward": 1.3746126890182495, "reward_std": 0.11688078194856644, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.3746126592159271, "rewards/correct_reward_func/std": 0.16250161826610565, "step": 16 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2076.0, "completions/max_terminated_length": 2076.0, "completions/mean_length": 1465.607177734375, "completions/mean_terminated_length": 1465.607177734375, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.0264797507788162, "grad_norm": 0.5914926528930664, "kl": 0.002100524492561817, "learning_rate": 1.99625e-06, "loss": 0.0093, "num_tokens": 2283034.0, "reward": 1.3505299091339111, "reward_std": 0.10699693858623505, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.35052984952926636, "rewards/correct_reward_func/std": 0.13587050139904022, "step": 17 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2083.0, "completions/max_terminated_length": 2083.0, "completions/mean_length": 1521.297607421875, "completions/mean_terminated_length": 1521.297607421875, "completions/min_length": 1084.0, "completions/min_terminated_length": 1084.0, "epoch": 0.028037383177570093, "grad_norm": 0.5633271336555481, "kl": 0.002299150452017784, "learning_rate": 1.995625e-06, "loss": 0.0163, "num_tokens": 2416793.0, "reward": 1.3485947847366333, "reward_std": 0.12012340128421783, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.3485947251319885, "rewards/correct_reward_func/std": 0.15519553422927856, "step": 18 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2358.0, "completions/max_terminated_length": 2358.0, "completions/mean_length": 1465.702392578125, "completions/mean_terminated_length": 1465.702392578125, "completions/min_length": 853.0, "completions/min_terminated_length": 853.0, "epoch": 0.029595015576323987, "grad_norm": 0.5992788076400757, "kl": 0.002645128988660872, "learning_rate": 1.995e-06, "loss": 0.0093, "num_tokens": 2545768.0, "reward": 1.4050683975219727, "reward_std": 0.09077386558055878, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4050683081150055, "rewards/correct_reward_func/std": 0.1320529729127884, "step": 19 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2399.0, "completions/max_terminated_length": 2399.0, "completions/mean_length": 1532.8929443359375, "completions/mean_terminated_length": 1532.8929443359375, "completions/min_length": 966.0, "completions/min_terminated_length": 966.0, "epoch": 0.03115264797507788, "grad_norm": 0.5546616315841675, "kl": 0.00307619187515229, "learning_rate": 1.994375e-06, "loss": 0.0061, "num_tokens": 2680495.0, "reward": 1.4120489358901978, "reward_std": 0.0814485251903534, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.41204896569252014, "rewards/correct_reward_func/std": 0.14482632279396057, "step": 20 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.023809523809523836, "completions/max_length": 8192.0, "completions/max_terminated_length": 2494.0, "completions/mean_length": 1707.9285888671875, "completions/mean_terminated_length": 1549.7803955078125, "completions/min_length": 659.0, "completions/min_terminated_length": 659.0, "epoch": 0.03271028037383177, "grad_norm": 0.588342010974884, "kl": 0.0031734263757243752, "learning_rate": 1.9937499999999998e-06, "loss": 0.0697, "num_tokens": 2830129.0, "reward": 1.336440920829773, "reward_std": 0.10719747841358185, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.3364408016204834, "rewards/correct_reward_func/std": 0.1317695528268814, "step": 21 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2244.0, "completions/max_terminated_length": 2244.0, "completions/mean_length": 1420.0238037109375, "completions/mean_terminated_length": 1420.0238037109375, "completions/min_length": 829.0, "completions/min_terminated_length": 829.0, "epoch": 0.03426791277258567, "grad_norm": 0.5817105770111084, "kl": 0.003916586167179048, "learning_rate": 1.993125e-06, "loss": -0.0456, "num_tokens": 2955267.0, "reward": 1.4114601612091064, "reward_std": 0.1481117159128189, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.43526971340179443, "rewards/correct_reward_func/std": 0.13317571580410004, "step": 22 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2074.0, "completions/max_terminated_length": 2074.0, "completions/mean_length": 1391.2738037109375, "completions/mean_terminated_length": 1391.2738037109375, "completions/min_length": 834.0, "completions/min_terminated_length": 834.0, "epoch": 0.03582554517133956, "grad_norm": 0.6308074593544006, "kl": 0.004312207689508796, "learning_rate": 1.9925e-06, "loss": -0.0213, "num_tokens": 3078014.0, "reward": 1.37629234790802, "reward_std": 0.16423028707504272, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.4001017212867737, "rewards/correct_reward_func/std": 0.18709857761859894, "step": 23 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2132.0, "completions/max_terminated_length": 2132.0, "completions/mean_length": 1513.857177734375, "completions/mean_terminated_length": 1513.857177734375, "completions/min_length": 849.0, "completions/min_terminated_length": 849.0, "epoch": 0.037383177570093455, "grad_norm": 0.5661871433258057, "kl": 0.004699907032772899, "learning_rate": 1.991875e-06, "loss": -0.0204, "num_tokens": 3211100.0, "reward": 1.384656310081482, "reward_std": 0.06906478852033615, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.38465628027915955, "rewards/correct_reward_func/std": 0.13333608210086823, "step": 24 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2097.0, "completions/max_terminated_length": 2097.0, "completions/mean_length": 1553.4761962890625, "completions/mean_terminated_length": 1553.4761962890625, "completions/min_length": 897.0, "completions/min_terminated_length": 897.0, "epoch": 0.03894080996884735, "grad_norm": 0.5596578121185303, "kl": 0.0051078200340271, "learning_rate": 1.9912499999999998e-06, "loss": -0.0025, "num_tokens": 3347538.0, "reward": 1.4165700674057007, "reward_std": 0.0908581092953682, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4165700376033783, "rewards/correct_reward_func/std": 0.11516361683607101, "step": 25 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2070.0, "completions/mean_length": 1607.5357666015625, "completions/mean_terminated_length": 1528.2047119140625, "completions/min_length": 533.0, "completions/min_terminated_length": 533.0, "epoch": 0.040498442367601244, "grad_norm": 0.5141485929489136, "kl": 0.005483957007527351, "learning_rate": 1.990625e-06, "loss": 0.0479, "num_tokens": 3488835.0, "reward": 1.3931559324264526, "reward_std": 0.09568320959806442, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.3931559920310974, "rewards/correct_reward_func/std": 0.15160411596298218, "step": 26 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2595.0, "completions/max_terminated_length": 2595.0, "completions/mean_length": 1509.1429443359375, "completions/mean_terminated_length": 1509.1429443359375, "completions/min_length": 955.0, "completions/min_terminated_length": 955.0, "epoch": 0.04205607476635514, "grad_norm": 0.5826243758201599, "kl": 0.006127089960500598, "learning_rate": 1.99e-06, "loss": 0.0232, "num_tokens": 3621663.0, "reward": 1.3912872076034546, "reward_std": 0.09357985109090805, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.3912872076034546, "rewards/correct_reward_func/std": 0.12481305748224258, "step": 27 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2237.0, "completions/max_terminated_length": 2237.0, "completions/mean_length": 1489.3214111328125, "completions/mean_terminated_length": 1489.3214111328125, "completions/min_length": 939.0, "completions/min_terminated_length": 939.0, "epoch": 0.04361370716510903, "grad_norm": 0.5792966485023499, "kl": 0.006385253742337227, "learning_rate": 1.989375e-06, "loss": -0.0165, "num_tokens": 3752922.0, "reward": 1.353344440460205, "reward_std": 0.1214955672621727, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.3771539032459259, "rewards/correct_reward_func/std": 0.1341404765844345, "step": 28 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2873.0, "completions/max_terminated_length": 2873.0, "completions/mean_length": 1522.3095703125, "completions/mean_terminated_length": 1522.3095703125, "completions/min_length": 895.0, "completions/min_terminated_length": 895.0, "epoch": 0.045171339563862926, "grad_norm": 0.566626250743866, "kl": 0.0068822442553937435, "learning_rate": 1.98875e-06, "loss": 0.0097, "num_tokens": 3886802.0, "reward": 1.4724082946777344, "reward_std": 0.11441156268119812, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4843129515647888, "rewards/correct_reward_func/std": 0.1651531606912613, "step": 29 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2806.0, "completions/max_terminated_length": 2806.0, "completions/mean_length": 1525.607177734375, "completions/mean_terminated_length": 1525.607177734375, "completions/min_length": 770.0, "completions/min_terminated_length": 770.0, "epoch": 0.04672897196261682, "grad_norm": 0.5457088351249695, "kl": 0.0072290110401809216, "learning_rate": 1.9881249999999997e-06, "loss": -0.0188, "num_tokens": 4021133.0, "reward": 1.4517107009887695, "reward_std": 0.07869784533977509, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4517105519771576, "rewards/correct_reward_func/std": 0.1555166095495224, "step": 30 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2269.0, "completions/max_terminated_length": 2269.0, "completions/mean_length": 1493.107177734375, "completions/mean_terminated_length": 1493.107177734375, "completions/min_length": 962.0, "completions/min_terminated_length": 962.0, "epoch": 0.048286604361370715, "grad_norm": 0.6338136792182922, "kl": 0.00765396817587316, "learning_rate": 1.9875e-06, "loss": -0.0171, "num_tokens": 4152536.0, "reward": 1.4097148180007935, "reward_std": 0.06910388171672821, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.409714937210083, "rewards/correct_reward_func/std": 0.15830738842487335, "step": 31 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2282.0, "completions/max_terminated_length": 2282.0, "completions/mean_length": 1465.21435546875, "completions/mean_terminated_length": 1465.21435546875, "completions/min_length": 910.0, "completions/min_terminated_length": 910.0, "epoch": 0.04984423676012461, "grad_norm": 0.5526667833328247, "kl": 0.008096857462078333, "learning_rate": 1.986875e-06, "loss": -0.0181, "num_tokens": 4281500.0, "reward": 1.4042376279830933, "reward_std": 0.13532030582427979, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.4280470907688141, "rewards/correct_reward_func/std": 0.1270482838153839, "step": 32 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2617.0, "completions/max_terminated_length": 2617.0, "completions/mean_length": 1565.357177734375, "completions/mean_terminated_length": 1565.357177734375, "completions/min_length": 1090.0, "completions/min_terminated_length": 1090.0, "epoch": 0.0514018691588785, "grad_norm": 0.5518661737442017, "kl": 0.008292545564472675, "learning_rate": 1.98625e-06, "loss": -0.0018, "num_tokens": 4419188.0, "reward": 1.4700257778167725, "reward_std": 0.07613833993673325, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4700256884098053, "rewards/correct_reward_func/std": 0.1587969958782196, "step": 33 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2546.0, "completions/max_terminated_length": 2546.0, "completions/mean_length": 1508.3333740234375, "completions/mean_terminated_length": 1508.3333740234375, "completions/min_length": 985.0, "completions/min_terminated_length": 985.0, "epoch": 0.0529595015576324, "grad_norm": 0.5799688696861267, "kl": 0.008962879423052073, "learning_rate": 1.9856249999999997e-06, "loss": 0.0352, "num_tokens": 4551942.0, "reward": 1.3886348009109497, "reward_std": 0.0952007845044136, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4005395472049713, "rewards/correct_reward_func/std": 0.13069912791252136, "step": 34 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2638.0, "completions/mean_length": 1659.6429443359375, "completions/mean_terminated_length": 1580.939697265625, "completions/min_length": 843.0, "completions/min_terminated_length": 843.0, "epoch": 0.05451713395638629, "grad_norm": 0.5250554084777832, "kl": 0.008917136583477259, "learning_rate": 1.985e-06, "loss": 0.0202, "num_tokens": 4697496.0, "reward": 1.4027281999588013, "reward_std": 0.1092085987329483, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.41463297605514526, "rewards/correct_reward_func/std": 0.15449127554893494, "step": 35 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2422.0, "completions/max_terminated_length": 2422.0, "completions/mean_length": 1497.416748046875, "completions/mean_terminated_length": 1497.416748046875, "completions/min_length": 893.0, "completions/min_terminated_length": 893.0, "epoch": 0.056074766355140186, "grad_norm": 0.5780891180038452, "kl": 0.009273746516555548, "learning_rate": 1.984375e-06, "loss": -0.0039, "num_tokens": 4829153.0, "reward": 1.447800636291504, "reward_std": 0.09562971442937851, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.44780054688453674, "rewards/correct_reward_func/std": 0.14727683365345, "step": 36 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2437.0, "completions/mean_length": 1602.2261962890625, "completions/mean_terminated_length": 1522.831298828125, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.05763239875389408, "grad_norm": 0.544447660446167, "kl": 0.009292236994951963, "learning_rate": 1.98375e-06, "loss": 0.0476, "num_tokens": 4969716.0, "reward": 1.3862570524215698, "reward_std": 0.13318565487861633, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.39816194772720337, "rewards/correct_reward_func/std": 0.13715338706970215, "step": 37 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2603.0, "completions/max_terminated_length": 2603.0, "completions/mean_length": 1596.1429443359375, "completions/mean_terminated_length": 1596.1429443359375, "completions/min_length": 1108.0, "completions/min_terminated_length": 1108.0, "epoch": 0.059190031152647975, "grad_norm": 0.5399186015129089, "kl": 0.009696871042251587, "learning_rate": 1.9831249999999998e-06, "loss": 0.0039, "num_tokens": 5109858.0, "reward": 1.450761318206787, "reward_std": 0.07588593661785126, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4507613480091095, "rewards/correct_reward_func/std": 0.14506648480892181, "step": 38 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2544.0, "completions/max_terminated_length": 2544.0, "completions/mean_length": 1563.5833740234375, "completions/mean_terminated_length": 1563.5833740234375, "completions/min_length": 980.0, "completions/min_terminated_length": 980.0, "epoch": 0.06074766355140187, "grad_norm": 0.5695660710334778, "kl": 0.01048774877563119, "learning_rate": 1.9824999999999997e-06, "loss": -0.0184, "num_tokens": 5247163.0, "reward": 1.468957781791687, "reward_std": 0.11620029807090759, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.48086267709732056, "rewards/correct_reward_func/std": 0.15909142792224884, "step": 39 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2598.0, "completions/max_terminated_length": 2598.0, "completions/mean_length": 1592.40478515625, "completions/mean_terminated_length": 1592.40478515625, "completions/min_length": 703.0, "completions/min_terminated_length": 703.0, "epoch": 0.06230529595015576, "grad_norm": 0.5112860798835754, "kl": 0.01074655307456851, "learning_rate": 1.981875e-06, "loss": 0.0015, "num_tokens": 5387021.0, "reward": 1.4456558227539062, "reward_std": 0.11054416000843048, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.45756059885025024, "rewards/correct_reward_func/std": 0.1503782570362091, "step": 40 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2603.0, "completions/mean_length": 1634.6190185546875, "completions/mean_terminated_length": 1555.6143798828125, "completions/min_length": 892.0, "completions/min_terminated_length": 892.0, "epoch": 0.06386292834890965, "grad_norm": 0.5349351167678833, "kl": 0.010826343204826117, "learning_rate": 1.98125e-06, "loss": 0.0725, "num_tokens": 5530521.0, "reward": 1.4252383708953857, "reward_std": 0.11318045854568481, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.43714308738708496, "rewards/correct_reward_func/std": 0.1361854374408722, "step": 41 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2310.0, "completions/max_terminated_length": 2310.0, "completions/mean_length": 1527.047607421875, "completions/mean_terminated_length": 1527.047607421875, "completions/min_length": 43.0, "completions/min_terminated_length": 43.0, "epoch": 0.06542056074766354, "grad_norm": 0.582492470741272, "kl": 0.010992726311087608, "learning_rate": 1.980625e-06, "loss": -0.0331, "num_tokens": 5664793.0, "reward": 1.4851031303405762, "reward_std": 0.1116347685456276, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4851030707359314, "rewards/correct_reward_func/std": 0.18202589452266693, "step": 42 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2985.0, "completions/max_terminated_length": 2985.0, "completions/mean_length": 1511.59521484375, "completions/mean_terminated_length": 1511.59521484375, "completions/min_length": 880.0, "completions/min_terminated_length": 880.0, "epoch": 0.06697819314641744, "grad_norm": 0.5876683592796326, "kl": 0.011393898166716099, "learning_rate": 1.98e-06, "loss": 0.0116, "num_tokens": 5797743.0, "reward": 1.3936251401901245, "reward_std": 0.08762513846158981, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4055299162864685, "rewards/correct_reward_func/std": 0.15156783163547516, "step": 43 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2907.0, "completions/max_terminated_length": 2907.0, "completions/mean_length": 1573.21435546875, "completions/mean_terminated_length": 1573.21435546875, "completions/min_length": 1004.0, "completions/min_terminated_length": 1004.0, "epoch": 0.06853582554517133, "grad_norm": 0.586552619934082, "kl": 0.012315568514168262, "learning_rate": 1.979375e-06, "loss": 0.0067, "num_tokens": 5935767.0, "reward": 1.3731769323349, "reward_std": 0.09234315901994705, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.3850816488265991, "rewards/correct_reward_func/std": 0.1175212487578392, "step": 44 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2610.0, "completions/max_terminated_length": 2610.0, "completions/mean_length": 1540.416748046875, "completions/mean_terminated_length": 1540.416748046875, "completions/min_length": 1049.0, "completions/min_terminated_length": 1049.0, "epoch": 0.07009345794392523, "grad_norm": 0.5170222520828247, "kl": 0.012232929933816195, "learning_rate": 1.97875e-06, "loss": -0.0046, "num_tokens": 6071102.0, "reward": 1.4366557598114014, "reward_std": 0.05740538239479065, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4366556704044342, "rewards/correct_reward_func/std": 0.12168268114328384, "step": 45 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2226.0, "completions/max_terminated_length": 2226.0, "completions/mean_length": 1525.0, "completions/mean_terminated_length": 1525.0, "completions/min_length": 940.0, "completions/min_terminated_length": 940.0, "epoch": 0.07165109034267912, "grad_norm": 0.5328289270401001, "kl": 0.01250599604099989, "learning_rate": 1.978125e-06, "loss": 0.001, "num_tokens": 6205328.0, "reward": 1.4486056566238403, "reward_std": 0.08073987811803818, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4605104327201843, "rewards/correct_reward_func/std": 0.1921333372592926, "step": 46 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2231.0, "completions/max_terminated_length": 2231.0, "completions/mean_length": 1511.3095703125, "completions/mean_terminated_length": 1511.3095703125, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.07320872274143302, "grad_norm": 0.5911400318145752, "kl": 0.012701177038252354, "learning_rate": 1.9775e-06, "loss": -0.0073, "num_tokens": 6338398.0, "reward": 1.4118802547454834, "reward_std": 0.09744904190301895, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.42378488183021545, "rewards/correct_reward_func/std": 0.15915460884571075, "step": 47 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2418.0, "completions/max_terminated_length": 2418.0, "completions/mean_length": 1445.1785888671875, "completions/mean_terminated_length": 1445.1785888671875, "completions/min_length": 1009.0, "completions/min_terminated_length": 1009.0, "epoch": 0.07476635514018691, "grad_norm": 0.5825939774513245, "kl": 0.014202028047293425, "learning_rate": 1.976875e-06, "loss": 0.0291, "num_tokens": 6465733.0, "reward": 1.4184983968734741, "reward_std": 0.07337880879640579, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.41849830746650696, "rewards/correct_reward_func/std": 0.12052969634532928, "step": 48 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2239.0, "completions/max_terminated_length": 2239.0, "completions/mean_length": 1505.0595703125, "completions/mean_terminated_length": 1505.0595703125, "completions/min_length": 872.0, "completions/min_terminated_length": 872.0, "epoch": 0.0763239875389408, "grad_norm": 0.5948581099510193, "kl": 0.013798453379422426, "learning_rate": 1.97625e-06, "loss": -0.0075, "num_tokens": 6598212.0, "reward": 1.4806807041168213, "reward_std": 0.07690379023551941, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.48068052530288696, "rewards/correct_reward_func/std": 0.208627387881279, "step": 49 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2220.0, "completions/mean_length": 1519.84521484375, "completions/mean_terminated_length": 1439.457763671875, "completions/min_length": 750.0, "completions/min_terminated_length": 750.0, "epoch": 0.0778816199376947, "grad_norm": 0.577416181564331, "kl": 0.013814115896821022, "learning_rate": 1.975625e-06, "loss": 0.0698, "num_tokens": 6731633.0, "reward": 1.3718026876449585, "reward_std": 0.08667115122079849, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.37180256843566895, "rewards/correct_reward_func/std": 0.1464298814535141, "step": 50 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2822.0, "completions/max_terminated_length": 2822.0, "completions/mean_length": 1481.9881591796875, "completions/mean_terminated_length": 1481.9881591796875, "completions/min_length": 799.0, "completions/min_terminated_length": 799.0, "epoch": 0.0794392523364486, "grad_norm": 0.5731110572814941, "kl": 0.014519122894853354, "learning_rate": 1.975e-06, "loss": 0.0268, "num_tokens": 6861940.0, "reward": 1.4318668842315674, "reward_std": 0.10813824832439423, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.443771630525589, "rewards/correct_reward_func/std": 0.155172199010849, "step": 51 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2002.0, "completions/max_terminated_length": 2002.0, "completions/mean_length": 1394.15478515625, "completions/mean_terminated_length": 1394.15478515625, "completions/min_length": 670.0, "completions/min_terminated_length": 670.0, "epoch": 0.08099688473520249, "grad_norm": 0.6160910129547119, "kl": 0.01515409117564559, "learning_rate": 1.974375e-06, "loss": -0.0362, "num_tokens": 6984959.0, "reward": 1.4249346256256104, "reward_std": 0.06116212159395218, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.424934446811676, "rewards/correct_reward_func/std": 0.15084582567214966, "step": 52 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2089.0, "completions/max_terminated_length": 2089.0, "completions/mean_length": 1428.0238037109375, "completions/mean_terminated_length": 1428.0238037109375, "completions/min_length": 855.0, "completions/min_terminated_length": 855.0, "epoch": 0.08255451713395638, "grad_norm": 0.6062555909156799, "kl": 0.015089603140950203, "learning_rate": 1.97375e-06, "loss": 0.0005, "num_tokens": 7110697.0, "reward": 1.427535891532898, "reward_std": 0.11901802569627762, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4394405484199524, "rewards/correct_reward_func/std": 0.17434334754943848, "step": 53 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2193.0, "completions/mean_length": 1549.46435546875, "completions/mean_terminated_length": 1469.4337158203125, "completions/min_length": 769.0, "completions/min_terminated_length": 769.0, "epoch": 0.08411214953271028, "grad_norm": 0.5520058274269104, "kl": 0.014234152156859636, "learning_rate": 1.973125e-06, "loss": 0.0763, "num_tokens": 7246990.0, "reward": 1.5137581825256348, "reward_std": 0.0792492926120758, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5137581825256348, "rewards/correct_reward_func/std": 0.1610475480556488, "step": 54 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1853.0, "completions/max_terminated_length": 1853.0, "completions/mean_length": 1293.4405517578125, "completions/mean_terminated_length": 1293.4405517578125, "completions/min_length": 20.0, "completions/min_terminated_length": 20.0, "epoch": 0.08566978193146417, "grad_norm": 0.605660617351532, "kl": 0.01542581431567669, "learning_rate": 1.9724999999999997e-06, "loss": -0.038, "num_tokens": 7361321.0, "reward": 1.4857640266418457, "reward_std": 0.10166757553815842, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4857640564441681, "rewards/correct_reward_func/std": 0.16004827618598938, "step": 55 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2166.0, "completions/max_terminated_length": 2166.0, "completions/mean_length": 1419.96435546875, "completions/mean_terminated_length": 1419.96435546875, "completions/min_length": 905.0, "completions/min_terminated_length": 905.0, "epoch": 0.08722741433021806, "grad_norm": 0.5911623239517212, "kl": 0.015617348719388247, "learning_rate": 1.971875e-06, "loss": 0.0036, "num_tokens": 7486292.0, "reward": 1.376123309135437, "reward_std": 0.09990442544221878, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.38802799582481384, "rewards/correct_reward_func/std": 0.12811584770679474, "step": 56 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2756.0, "completions/max_terminated_length": 2756.0, "completions/mean_length": 1440.84521484375, "completions/mean_terminated_length": 1440.84521484375, "completions/min_length": 683.0, "completions/min_terminated_length": 683.0, "epoch": 0.08878504672897196, "grad_norm": 0.5922889113426208, "kl": 0.016300208866596222, "learning_rate": 1.97125e-06, "loss": 0.0113, "num_tokens": 7613335.0, "reward": 1.4130823612213135, "reward_std": 0.1000562384724617, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.42498698830604553, "rewards/correct_reward_func/std": 0.11995380371809006, "step": 57 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2026.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 1405.3214111328125, "completions/mean_terminated_length": 1405.3214111328125, "completions/min_length": 637.0, "completions/min_terminated_length": 637.0, "epoch": 0.09034267912772585, "grad_norm": 0.5695552825927734, "kl": 0.015375382732599974, "learning_rate": 1.970625e-06, "loss": 0.0023, "num_tokens": 7737406.0, "reward": 1.4474685192108154, "reward_std": 0.12695710361003876, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.4712778627872467, "rewards/correct_reward_func/std": 0.16331063210964203, "step": 58 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6591.0, "completions/max_terminated_length": 6591.0, "completions/mean_length": 1408.65478515625, "completions/mean_terminated_length": 1408.65478515625, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "epoch": 0.09190031152647975, "grad_norm": 0.5550790429115295, "kl": 0.015237292740494013, "learning_rate": 1.9699999999999998e-06, "loss": -0.0213, "num_tokens": 7861643.0, "reward": 1.4678882360458374, "reward_std": 0.09509699046611786, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.46788811683654785, "rewards/correct_reward_func/std": 0.1579650342464447, "step": 59 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2238.0, "completions/max_terminated_length": 2238.0, "completions/mean_length": 1376.84521484375, "completions/mean_terminated_length": 1376.84521484375, "completions/min_length": 777.0, "completions/min_terminated_length": 777.0, "epoch": 0.09345794392523364, "grad_norm": 0.5696773529052734, "kl": 0.01649821363389492, "learning_rate": 1.969375e-06, "loss": -0.0025, "num_tokens": 7983274.0, "reward": 1.4070838689804077, "reward_std": 0.061898693442344666, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4070839285850525, "rewards/correct_reward_func/std": 0.1115923598408699, "step": 60 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2341.0, "completions/max_terminated_length": 2341.0, "completions/mean_length": 1368.8214111328125, "completions/mean_terminated_length": 1368.8214111328125, "completions/min_length": 659.0, "completions/min_terminated_length": 659.0, "epoch": 0.09501557632398754, "grad_norm": 0.6040320992469788, "kl": 0.017192344181239605, "learning_rate": 1.96875e-06, "loss": 0.0044, "num_tokens": 8104279.0, "reward": 1.503865361213684, "reward_std": 0.10960246622562408, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5157700777053833, "rewards/correct_reward_func/std": 0.17495499551296234, "step": 61 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1887.0, "completions/max_terminated_length": 1887.0, "completions/mean_length": 1308.5, "completions/mean_terminated_length": 1308.5, "completions/min_length": 658.0, "completions/min_terminated_length": 658.0, "epoch": 0.09657320872274143, "grad_norm": 0.6219011545181274, "kl": 0.017216363921761513, "learning_rate": 1.968125e-06, "loss": 0.0128, "num_tokens": 8219905.0, "reward": 1.4492701292037964, "reward_std": 0.0713193342089653, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4492699205875397, "rewards/correct_reward_func/std": 0.179514080286026, "step": 62 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1914.0, "completions/max_terminated_length": 1914.0, "completions/mean_length": 1346.261962890625, "completions/mean_terminated_length": 1346.261962890625, "completions/min_length": 681.0, "completions/min_terminated_length": 681.0, "epoch": 0.09813084112149532, "grad_norm": 0.5801587104797363, "kl": 0.01690333615988493, "learning_rate": 1.9675e-06, "loss": 0.0077, "num_tokens": 8338913.0, "reward": 1.456557273864746, "reward_std": 0.11657059192657471, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.46846190094947815, "rewards/correct_reward_func/std": 0.1239844486117363, "step": 63 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2429.0, "completions/max_terminated_length": 2429.0, "completions/mean_length": 1346.7381591796875, "completions/mean_terminated_length": 1346.7381591796875, "completions/min_length": 531.0, "completions/min_terminated_length": 531.0, "epoch": 0.09968847352024922, "grad_norm": 0.6113296747207642, "kl": 0.017923656851053238, "learning_rate": 1.9668749999999997e-06, "loss": -0.0187, "num_tokens": 8458009.0, "reward": 1.448889136314392, "reward_std": 0.07096786797046661, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4488890469074249, "rewards/correct_reward_func/std": 0.15353722870349884, "step": 64 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2639.0, "completions/max_terminated_length": 2639.0, "completions/mean_length": 1402.4761962890625, "completions/mean_terminated_length": 1402.4761962890625, "completions/min_length": 874.0, "completions/min_terminated_length": 874.0, "epoch": 0.10124610591900311, "grad_norm": 0.5536694526672363, "kl": 0.017508030869066715, "learning_rate": 1.96625e-06, "loss": -0.0024, "num_tokens": 8581955.0, "reward": 1.4375591278076172, "reward_std": 0.12363146990537643, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.4613686203956604, "rewards/correct_reward_func/std": 0.16242319345474243, "step": 65 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2007.0, "completions/max_terminated_length": 2007.0, "completions/mean_length": 1408.5357666015625, "completions/mean_terminated_length": 1408.5357666015625, "completions/min_length": 745.0, "completions/min_terminated_length": 745.0, "epoch": 0.102803738317757, "grad_norm": 0.5778936743736267, "kl": 0.018688876181840897, "learning_rate": 1.965625e-06, "loss": 0.0091, "num_tokens": 8706158.0, "reward": 1.4179571866989136, "reward_std": 0.08643031865358353, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.41795703768730164, "rewards/correct_reward_func/std": 0.14004966616630554, "step": 66 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2824.0, "completions/max_terminated_length": 2824.0, "completions/mean_length": 1323.7857666015625, "completions/mean_terminated_length": 1323.7857666015625, "completions/min_length": 797.0, "completions/min_terminated_length": 797.0, "epoch": 0.1043613707165109, "grad_norm": 0.6043643355369568, "kl": 0.01887867320328951, "learning_rate": 1.965e-06, "loss": -0.0489, "num_tokens": 8823284.0, "reward": 1.4494088888168335, "reward_std": 0.1078440248966217, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4613136053085327, "rewards/correct_reward_func/std": 0.16942912340164185, "step": 67 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1987.0, "completions/max_terminated_length": 1987.0, "completions/mean_length": 1402.0714111328125, "completions/mean_terminated_length": 1402.0714111328125, "completions/min_length": 850.0, "completions/min_terminated_length": 850.0, "epoch": 0.1059190031152648, "grad_norm": 0.5758241415023804, "kl": 0.01884887833148241, "learning_rate": 1.9643749999999997e-06, "loss": 0.0126, "num_tokens": 8947046.0, "reward": 1.4188536405563354, "reward_std": 0.12322477996349335, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.4426631033420563, "rewards/correct_reward_func/std": 0.137950137257576, "step": 68 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1983.0, "completions/max_terminated_length": 1983.0, "completions/mean_length": 1305.75, "completions/mean_terminated_length": 1305.75, "completions/min_length": 719.0, "completions/min_terminated_length": 719.0, "epoch": 0.10747663551401869, "grad_norm": 0.634208083152771, "kl": 0.01928142551332712, "learning_rate": 1.96375e-06, "loss": 0.0068, "num_tokens": 9062675.0, "reward": 1.4428762197494507, "reward_std": 0.0850701555609703, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.44287601113319397, "rewards/correct_reward_func/std": 0.13382165133953094, "step": 69 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1879.0, "completions/max_terminated_length": 1879.0, "completions/mean_length": 1331.6190185546875, "completions/mean_terminated_length": 1331.6190185546875, "completions/min_length": 442.0, "completions/min_terminated_length": 442.0, "epoch": 0.10903426791277258, "grad_norm": 0.5981489419937134, "kl": 0.019094611518085003, "learning_rate": 1.963125e-06, "loss": 0.0136, "num_tokens": 9180489.0, "reward": 1.4353286027908325, "reward_std": 0.1428486853837967, "rewards/contains_chinese/mean": 0.9642857313156128, "rewards/contains_chinese/std": 0.18669146299362183, "rewards/correct_reward_func/mean": 0.47104281187057495, "rewards/correct_reward_func/std": 0.14202405512332916, "step": 70 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 1905.0, "completions/mean_length": 1393.84521484375, "completions/mean_terminated_length": 1311.939697265625, "completions/min_length": 384.0, "completions/min_terminated_length": 384.0, "epoch": 0.11059190031152648, "grad_norm": 0.6035619378089905, "kl": 0.018923446536064148, "learning_rate": 1.9625e-06, "loss": 0.0341, "num_tokens": 9303716.0, "reward": 1.4664435386657715, "reward_std": 0.09690098464488983, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4783483147621155, "rewards/correct_reward_func/std": 0.16768568754196167, "step": 71 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2748.0, "completions/max_terminated_length": 2748.0, "completions/mean_length": 1333.5833740234375, "completions/mean_terminated_length": 1333.5833740234375, "completions/min_length": 815.0, "completions/min_terminated_length": 815.0, "epoch": 0.11214953271028037, "grad_norm": 0.6100907325744629, "kl": 0.020233074203133583, "learning_rate": 1.9618749999999997e-06, "loss": 0.0087, "num_tokens": 9421647.0, "reward": 1.5223532915115356, "reward_std": 0.09191029518842697, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5342578887939453, "rewards/correct_reward_func/std": 0.14939941465854645, "step": 72 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1924.0, "completions/max_terminated_length": 1924.0, "completions/mean_length": 1286.8333740234375, "completions/mean_terminated_length": 1286.8333740234375, "completions/min_length": 864.0, "completions/min_terminated_length": 864.0, "epoch": 0.11370716510903427, "grad_norm": 0.599524736404419, "kl": 0.020186283625662327, "learning_rate": 1.9612499999999996e-06, "loss": -0.0046, "num_tokens": 9535795.0, "reward": 1.5099772214889526, "reward_std": 0.08711431175470352, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5099770426750183, "rewards/correct_reward_func/std": 0.15553654730319977, "step": 73 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2063.0, "completions/max_terminated_length": 2063.0, "completions/mean_length": 1436.34521484375, "completions/mean_terminated_length": 1436.34521484375, "completions/min_length": 1035.0, "completions/min_terminated_length": 1035.0, "epoch": 0.11526479750778816, "grad_norm": 0.5551663041114807, "kl": 0.019929789006710052, "learning_rate": 1.960625e-06, "loss": 0.0163, "num_tokens": 9662448.0, "reward": 1.4788012504577637, "reward_std": 0.06518861651420593, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4788011312484741, "rewards/correct_reward_func/std": 0.1376221776008606, "step": 74 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1884.0, "completions/max_terminated_length": 1884.0, "completions/mean_length": 1261.25, "completions/mean_terminated_length": 1261.25, "completions/min_length": 748.0, "completions/min_terminated_length": 748.0, "epoch": 0.11682242990654206, "grad_norm": 0.6235907673835754, "kl": 0.020636904053390026, "learning_rate": 1.96e-06, "loss": 0.0244, "num_tokens": 9774249.0, "reward": 1.4924941062927246, "reward_std": 0.11271940171718597, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.492494136095047, "rewards/correct_reward_func/std": 0.1804288774728775, "step": 75 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2336.0, "completions/max_terminated_length": 2336.0, "completions/mean_length": 1304.4405517578125, "completions/mean_terminated_length": 1304.4405517578125, "completions/min_length": 777.0, "completions/min_terminated_length": 777.0, "epoch": 0.11838006230529595, "grad_norm": 0.5913470983505249, "kl": 0.021659635938704014, "learning_rate": 1.959375e-06, "loss": -0.0261, "num_tokens": 9889840.0, "reward": 1.5048737525939941, "reward_std": 0.07548126578330994, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5048737525939941, "rewards/correct_reward_func/std": 0.15308529138565063, "step": 76 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2124.0, "completions/max_terminated_length": 2124.0, "completions/mean_length": 1349.297607421875, "completions/mean_terminated_length": 1349.297607421875, "completions/min_length": 344.0, "completions/min_terminated_length": 344.0, "epoch": 0.11993769470404984, "grad_norm": 0.6202085018157959, "kl": 0.0223425030708313, "learning_rate": 1.95875e-06, "loss": -0.0086, "num_tokens": 10009103.0, "reward": 1.4445719718933105, "reward_std": 0.07185468822717667, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.44457200169563293, "rewards/correct_reward_func/std": 0.134343683719635, "step": 77 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2239.0, "completions/max_terminated_length": 2239.0, "completions/mean_length": 1348.916748046875, "completions/mean_terminated_length": 1348.916748046875, "completions/min_length": 868.0, "completions/min_terminated_length": 868.0, "epoch": 0.12149532710280374, "grad_norm": 0.5950252413749695, "kl": 0.022254208102822304, "learning_rate": 1.958125e-06, "loss": -0.0042, "num_tokens": 10128292.0, "reward": 1.423844575881958, "reward_std": 0.1012512668967247, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.43574920296669006, "rewards/correct_reward_func/std": 0.14782190322875977, "step": 78 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2228.0, "completions/max_terminated_length": 2228.0, "completions/mean_length": 1300.7738037109375, "completions/mean_terminated_length": 1300.7738037109375, "completions/min_length": 50.0, "completions/min_terminated_length": 50.0, "epoch": 0.12305295950155763, "grad_norm": 0.6564156413078308, "kl": 0.022449446842074394, "learning_rate": 1.9575e-06, "loss": -0.0301, "num_tokens": 10243383.0, "reward": 1.4560483694076538, "reward_std": 0.09143143892288208, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.45604828000068665, "rewards/correct_reward_func/std": 0.18612980842590332, "step": 79 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1842.0, "completions/max_terminated_length": 1842.0, "completions/mean_length": 1269.9761962890625, "completions/mean_terminated_length": 1269.9761962890625, "completions/min_length": 694.0, "completions/min_terminated_length": 694.0, "epoch": 0.12461059190031153, "grad_norm": 0.5963668823242188, "kl": 0.02302556298673153, "learning_rate": 1.956875e-06, "loss": 0.0111, "num_tokens": 10355959.0, "reward": 1.4893500804901123, "reward_std": 0.060889869928359985, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.48934999108314514, "rewards/correct_reward_func/std": 0.17626559734344482, "step": 80 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2109.0, "completions/max_terminated_length": 2109.0, "completions/mean_length": 1332.357177734375, "completions/mean_terminated_length": 1332.357177734375, "completions/min_length": 700.0, "completions/min_terminated_length": 700.0, "epoch": 0.1261682242990654, "grad_norm": 0.579901397228241, "kl": 0.023500431329011917, "learning_rate": 1.95625e-06, "loss": -0.0165, "num_tokens": 10473763.0, "reward": 1.4350974559783936, "reward_std": 0.09430722892284393, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.435097336769104, "rewards/correct_reward_func/std": 0.17246632277965546, "step": 81 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2049.0, "completions/max_terminated_length": 2049.0, "completions/mean_length": 1330.65478515625, "completions/mean_terminated_length": 1330.65478515625, "completions/min_length": 908.0, "completions/min_terminated_length": 908.0, "epoch": 0.1277258566978193, "grad_norm": 0.6202122569084167, "kl": 0.02360576204955578, "learning_rate": 1.955625e-06, "loss": -0.0197, "num_tokens": 10591580.0, "reward": 1.4157038927078247, "reward_std": 0.10765408724546432, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4276086390018463, "rewards/correct_reward_func/std": 0.12850892543792725, "step": 82 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 1902.0, "completions/mean_length": 1427.7738037109375, "completions/mean_terminated_length": 1346.277099609375, "completions/min_length": 883.0, "completions/min_terminated_length": 883.0, "epoch": 0.1292834890965732, "grad_norm": 0.6136038899421692, "kl": 0.023545796051621437, "learning_rate": 1.955e-06, "loss": 0.0744, "num_tokens": 10717513.0, "reward": 1.4062758684158325, "reward_std": 0.1044679582118988, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4181804656982422, "rewards/correct_reward_func/std": 0.12793242931365967, "step": 83 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1828.0, "completions/max_terminated_length": 1828.0, "completions/mean_length": 1328.46435546875, "completions/mean_terminated_length": 1328.46435546875, "completions/min_length": 757.0, "completions/min_terminated_length": 757.0, "epoch": 0.1308411214953271, "grad_norm": 0.6154831051826477, "kl": 0.024212509393692017, "learning_rate": 1.954375e-06, "loss": -0.0025, "num_tokens": 10835026.0, "reward": 1.4841961860656738, "reward_std": 0.08680614829063416, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.48419615626335144, "rewards/correct_reward_func/std": 0.18122969567775726, "step": 84 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2318.0, "completions/max_terminated_length": 2318.0, "completions/mean_length": 1370.1905517578125, "completions/mean_terminated_length": 1370.1905517578125, "completions/min_length": 769.0, "completions/min_terminated_length": 769.0, "epoch": 0.13239875389408098, "grad_norm": 0.5834784507751465, "kl": 0.02425501774996519, "learning_rate": 1.95375e-06, "loss": 0.0084, "num_tokens": 10956050.0, "reward": 1.4767500162124634, "reward_std": 0.0894772931933403, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4767499566078186, "rewards/correct_reward_func/std": 0.17486557364463806, "step": 85 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2188.0, "completions/max_terminated_length": 2188.0, "completions/mean_length": 1371.547607421875, "completions/mean_terminated_length": 1371.547607421875, "completions/min_length": 602.0, "completions/min_terminated_length": 602.0, "epoch": 0.13395638629283488, "grad_norm": 0.5802989602088928, "kl": 0.024927244521677494, "learning_rate": 1.953125e-06, "loss": 0.015, "num_tokens": 11077188.0, "reward": 1.4462058544158936, "reward_std": 0.07399098575115204, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4462057650089264, "rewards/correct_reward_func/std": 0.12212073057889938, "step": 86 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3011.0, "completions/max_terminated_length": 3011.0, "completions/mean_length": 1386.9405517578125, "completions/mean_terminated_length": 1386.9405517578125, "completions/min_length": 827.0, "completions/min_terminated_length": 827.0, "epoch": 0.13551401869158877, "grad_norm": 0.5793676376342773, "kl": 0.024698903784155846, "learning_rate": 1.9525e-06, "loss": -0.0095, "num_tokens": 11199721.0, "reward": 1.4632221460342407, "reward_std": 0.11924762278795242, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.47512686252593994, "rewards/correct_reward_func/std": 0.20697803795337677, "step": 87 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2026.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 1388.047607421875, "completions/mean_terminated_length": 1388.047607421875, "completions/min_length": 970.0, "completions/min_terminated_length": 970.0, "epoch": 0.13707165109034267, "grad_norm": 0.5643482804298401, "kl": 0.02437182515859604, "learning_rate": 1.951875e-06, "loss": -0.0021, "num_tokens": 11322299.0, "reward": 1.4515498876571655, "reward_std": 0.08298921585083008, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.45154979825019836, "rewards/correct_reward_func/std": 0.13497759401798248, "step": 88 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2292.0, "completions/mean_length": 1484.107177734375, "completions/mean_terminated_length": 1403.2890625, "completions/min_length": 775.0, "completions/min_terminated_length": 775.0, "epoch": 0.13862928348909656, "grad_norm": 0.5531883239746094, "kl": 0.023836837150156498, "learning_rate": 1.9512499999999997e-06, "loss": 0.0771, "num_tokens": 11453042.0, "reward": 1.4721571207046509, "reward_std": 0.07365047186613083, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4721570611000061, "rewards/correct_reward_func/std": 0.17172464728355408, "step": 89 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1920.0, "completions/max_terminated_length": 1920.0, "completions/mean_length": 1399.9881591796875, "completions/mean_terminated_length": 1399.9881591796875, "completions/min_length": 938.0, "completions/min_terminated_length": 938.0, "epoch": 0.14018691588785046, "grad_norm": 0.5798895359039307, "kl": 0.024346785619854927, "learning_rate": 1.950625e-06, "loss": 0.0128, "num_tokens": 11576779.0, "reward": 1.4762535095214844, "reward_std": 0.07557668536901474, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4762535095214844, "rewards/correct_reward_func/std": 0.11668115109205246, "step": 90 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2492.0, "completions/max_terminated_length": 2492.0, "completions/mean_length": 1411.7261962890625, "completions/mean_terminated_length": 1411.7261962890625, "completions/min_length": 792.0, "completions/min_terminated_length": 792.0, "epoch": 0.14174454828660435, "grad_norm": 0.5864601731300354, "kl": 0.025230017490684986, "learning_rate": 1.95e-06, "loss": -0.0058, "num_tokens": 11701376.0, "reward": 1.43711256980896, "reward_std": 0.06144386902451515, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4371124505996704, "rewards/correct_reward_func/std": 0.11439383029937744, "step": 91 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3613.0, "completions/max_terminated_length": 3613.0, "completions/mean_length": 1334.6190185546875, "completions/mean_terminated_length": 1334.6190185546875, "completions/min_length": 904.0, "completions/min_terminated_length": 904.0, "epoch": 0.14330218068535824, "grad_norm": 0.607031524181366, "kl": 0.026643778197467327, "learning_rate": 1.949375e-06, "loss": 0.0048, "num_tokens": 11819346.0, "reward": 1.4904909133911133, "reward_std": 0.0712086409330368, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4904908537864685, "rewards/correct_reward_func/std": 0.11422253400087357, "step": 92 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2193.0, "completions/max_terminated_length": 2193.0, "completions/mean_length": 1322.666748046875, "completions/mean_terminated_length": 1322.666748046875, "completions/min_length": 887.0, "completions/min_terminated_length": 887.0, "epoch": 0.14485981308411214, "grad_norm": 0.6401565074920654, "kl": 0.024812299758195877, "learning_rate": 1.9487499999999998e-06, "loss": -0.0045, "num_tokens": 11936330.0, "reward": 1.3906474113464355, "reward_std": 0.07590549439191818, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4025520086288452, "rewards/correct_reward_func/std": 0.16384169459342957, "step": 93 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3187.0, "completions/max_terminated_length": 3187.0, "completions/mean_length": 1393.202392578125, "completions/mean_terminated_length": 1393.202392578125, "completions/min_length": 946.0, "completions/min_terminated_length": 946.0, "epoch": 0.14641744548286603, "grad_norm": 0.6122695803642273, "kl": 0.025386733002960682, "learning_rate": 1.948125e-06, "loss": 0.0086, "num_tokens": 12059407.0, "reward": 1.562843680381775, "reward_std": 0.11002606898546219, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5747482776641846, "rewards/correct_reward_func/std": 0.1576448678970337, "step": 94 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2248.0, "completions/max_terminated_length": 2248.0, "completions/mean_length": 1397.547607421875, "completions/mean_terminated_length": 1397.547607421875, "completions/min_length": 299.0, "completions/min_terminated_length": 299.0, "epoch": 0.14797507788161993, "grad_norm": 0.620469868183136, "kl": 0.02531202882528305, "learning_rate": 1.9475e-06, "loss": -0.0123, "num_tokens": 12182867.0, "reward": 1.3950475454330444, "reward_std": 0.11134982109069824, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4069521725177765, "rewards/correct_reward_func/std": 0.14087031781673431, "step": 95 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2121.0, "completions/max_terminated_length": 2121.0, "completions/mean_length": 1391.297607421875, "completions/mean_terminated_length": 1391.297607421875, "completions/min_length": 782.0, "completions/min_terminated_length": 782.0, "epoch": 0.14953271028037382, "grad_norm": 0.5656567215919495, "kl": 0.025920305401086807, "learning_rate": 1.946875e-06, "loss": 0.0145, "num_tokens": 12305718.0, "reward": 1.447014570236206, "reward_std": 0.08030132949352264, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.44701454043388367, "rewards/correct_reward_func/std": 0.14291325211524963, "step": 96 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1768.0, "completions/max_terminated_length": 1768.0, "completions/mean_length": 1303.34521484375, "completions/mean_terminated_length": 1303.34521484375, "completions/min_length": 725.0, "completions/min_terminated_length": 725.0, "epoch": 0.15109034267912771, "grad_norm": 0.6328920125961304, "kl": 0.026815838180482388, "learning_rate": 1.94625e-06, "loss": 0.0115, "num_tokens": 12421073.0, "reward": 1.4320292472839355, "reward_std": 0.07025571167469025, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4320293068885803, "rewards/correct_reward_func/std": 0.14837507903575897, "step": 97 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2359.0, "completions/max_terminated_length": 2359.0, "completions/mean_length": 1374.34521484375, "completions/mean_terminated_length": 1374.34521484375, "completions/min_length": 532.0, "completions/min_terminated_length": 532.0, "epoch": 0.1526479750778816, "grad_norm": 0.564152181148529, "kl": 0.025968145579099655, "learning_rate": 1.9456249999999997e-06, "loss": -0.0224, "num_tokens": 12542284.0, "reward": 1.3898595571517944, "reward_std": 0.11127079278230667, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.41366904973983765, "rewards/correct_reward_func/std": 0.18172919750213623, "step": 98 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2000.0, "completions/max_terminated_length": 2000.0, "completions/mean_length": 1444.761962890625, "completions/mean_terminated_length": 1444.761962890625, "completions/min_length": 925.0, "completions/min_terminated_length": 925.0, "epoch": 0.1542056074766355, "grad_norm": 0.5919306874275208, "kl": 0.02604432962834835, "learning_rate": 1.945e-06, "loss": 0.028, "num_tokens": 12669842.0, "reward": 1.4752211570739746, "reward_std": 0.07153313606977463, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.47522109746932983, "rewards/correct_reward_func/std": 0.1358029991388321, "step": 99 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2159.0, "completions/max_terminated_length": 2159.0, "completions/mean_length": 1375.2857666015625, "completions/mean_terminated_length": 1375.2857666015625, "completions/min_length": 656.0, "completions/min_terminated_length": 656.0, "epoch": 0.1557632398753894, "grad_norm": 0.6110028624534607, "kl": 0.02663259394466877, "learning_rate": 1.944375e-06, "loss": 0.0359, "num_tokens": 12791516.0, "reward": 1.4065624475479126, "reward_std": 0.07423868775367737, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4065624475479126, "rewards/correct_reward_func/std": 0.1365046501159668, "step": 100 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2102.0, "completions/max_terminated_length": 2102.0, "completions/mean_length": 1338.75, "completions/mean_terminated_length": 1338.75, "completions/min_length": 797.0, "completions/min_terminated_length": 797.0, "epoch": 0.1573208722741433, "grad_norm": 0.6527758240699768, "kl": 0.026112915948033333, "learning_rate": 1.94375e-06, "loss": 0.0353, "num_tokens": 12909851.0, "reward": 1.4908164739608765, "reward_std": 0.11687764525413513, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5027210712432861, "rewards/correct_reward_func/std": 0.15153582394123077, "step": 101 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2034.0, "completions/max_terminated_length": 2034.0, "completions/mean_length": 1339.357177734375, "completions/mean_terminated_length": 1339.357177734375, "completions/min_length": 820.0, "completions/min_terminated_length": 820.0, "epoch": 0.1588785046728972, "grad_norm": 0.6052369475364685, "kl": 0.026593846268951893, "learning_rate": 1.9431249999999997e-06, "loss": -0.0369, "num_tokens": 13028333.0, "reward": 1.443188190460205, "reward_std": 0.06765951961278915, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4431880712509155, "rewards/correct_reward_func/std": 0.12718868255615234, "step": 102 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2281.0, "completions/mean_length": 1414.84521484375, "completions/mean_terminated_length": 1333.1927490234375, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.16043613707165108, "grad_norm": 0.5841978788375854, "kl": 0.027741556987166405, "learning_rate": 1.9424999999999996e-06, "loss": 0.0426, "num_tokens": 13153132.0, "reward": 1.458450198173523, "reward_std": 0.10668490082025528, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4584501385688782, "rewards/correct_reward_func/std": 0.17332585155963898, "step": 103 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2362.0, "completions/max_terminated_length": 2362.0, "completions/mean_length": 1408.5238037109375, "completions/mean_terminated_length": 1408.5238037109375, "completions/min_length": 949.0, "completions/min_terminated_length": 949.0, "epoch": 0.16199376947040497, "grad_norm": 0.574649453163147, "kl": 0.026361594907939434, "learning_rate": 1.941875e-06, "loss": 0.0063, "num_tokens": 13277502.0, "reward": 1.4935458898544312, "reward_std": 0.06738085299730301, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.493545800447464, "rewards/correct_reward_func/std": 0.17171715199947357, "step": 104 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2340.0, "completions/max_terminated_length": 2340.0, "completions/mean_length": 1397.2261962890625, "completions/mean_terminated_length": 1397.2261962890625, "completions/min_length": 825.0, "completions/min_terminated_length": 825.0, "epoch": 0.16355140186915887, "grad_norm": 0.6107763648033142, "kl": 0.028758167289197445, "learning_rate": 1.94125e-06, "loss": -0.0053, "num_tokens": 13401001.0, "reward": 1.4977682828903198, "reward_std": 0.07337197661399841, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.49776825308799744, "rewards/correct_reward_func/std": 0.16519995033740997, "step": 105 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1723.0, "completions/max_terminated_length": 1723.0, "completions/mean_length": 1302.9405517578125, "completions/mean_terminated_length": 1302.9405517578125, "completions/min_length": 809.0, "completions/min_terminated_length": 809.0, "epoch": 0.16510903426791276, "grad_norm": 0.5974235534667969, "kl": 0.02774975076317787, "learning_rate": 1.940625e-06, "loss": 0.0079, "num_tokens": 13516292.0, "reward": 1.500797152519226, "reward_std": 0.07562069594860077, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5007970333099365, "rewards/correct_reward_func/std": 0.1385519951581955, "step": 106 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2012.0, "completions/max_terminated_length": 2012.0, "completions/mean_length": 1333.3214111328125, "completions/mean_terminated_length": 1333.3214111328125, "completions/min_length": 776.0, "completions/min_terminated_length": 776.0, "epoch": 0.16666666666666666, "grad_norm": 0.5899143218994141, "kl": 0.02739392127841711, "learning_rate": 1.94e-06, "loss": -0.0042, "num_tokens": 13634147.0, "reward": 1.3661153316497803, "reward_std": 0.1458723396062851, "rewards/contains_chinese/mean": 0.9642857313156128, "rewards/contains_chinese/std": 0.18669146299362183, "rewards/correct_reward_func/mean": 0.4018295407295227, "rewards/correct_reward_func/std": 0.14165017008781433, "step": 107 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2314.0, "completions/max_terminated_length": 2314.0, "completions/mean_length": 1340.2261962890625, "completions/mean_terminated_length": 1340.2261962890625, "completions/min_length": 833.0, "completions/min_terminated_length": 833.0, "epoch": 0.16822429906542055, "grad_norm": 0.6039218902587891, "kl": 0.027482734993100166, "learning_rate": 1.939375e-06, "loss": 0.0001, "num_tokens": 13752750.0, "reward": 1.4691303968429565, "reward_std": 0.07967161387205124, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.469130277633667, "rewards/correct_reward_func/std": 0.17357668280601501, "step": 108 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2010.0, "completions/max_terminated_length": 2010.0, "completions/mean_length": 1375.047607421875, "completions/mean_terminated_length": 1375.047607421875, "completions/min_length": 935.0, "completions/min_terminated_length": 935.0, "epoch": 0.16978193146417445, "grad_norm": 0.575854480266571, "kl": 0.029047698713839054, "learning_rate": 1.93875e-06, "loss": -0.0041, "num_tokens": 13874344.0, "reward": 1.4227412939071655, "reward_std": 0.07607690989971161, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.42274120450019836, "rewards/correct_reward_func/std": 0.12703333795070648, "step": 109 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2687.0, "completions/mean_length": 1464.4285888671875, "completions/mean_terminated_length": 1383.3734130859375, "completions/min_length": 958.0, "completions/min_terminated_length": 958.0, "epoch": 0.17133956386292834, "grad_norm": 0.5789852738380432, "kl": 0.02678022440522909, "learning_rate": 1.938125e-06, "loss": 0.0679, "num_tokens": 14003362.0, "reward": 1.4550813436508179, "reward_std": 0.09208068251609802, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4550813138484955, "rewards/correct_reward_func/std": 0.12678542733192444, "step": 110 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1957.0, "completions/max_terminated_length": 1957.0, "completions/mean_length": 1335.797607421875, "completions/mean_terminated_length": 1335.797607421875, "completions/min_length": 828.0, "completions/min_terminated_length": 828.0, "epoch": 0.17289719626168223, "grad_norm": 0.5807675719261169, "kl": 0.027965486980974674, "learning_rate": 1.9375e-06, "loss": 0.0215, "num_tokens": 14121389.0, "reward": 1.4275166988372803, "reward_std": 0.09834519028663635, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.43942132592201233, "rewards/correct_reward_func/std": 0.17403005063533783, "step": 111 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2137.0, "completions/max_terminated_length": 2137.0, "completions/mean_length": 1329.2738037109375, "completions/mean_terminated_length": 1329.2738037109375, "completions/min_length": 883.0, "completions/min_terminated_length": 883.0, "epoch": 0.17445482866043613, "grad_norm": 0.6370654702186584, "kl": 0.03011655993759632, "learning_rate": 1.936875e-06, "loss": -0.0064, "num_tokens": 14239240.0, "reward": 1.4735430479049683, "reward_std": 0.07655790448188782, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4735429883003235, "rewards/correct_reward_func/std": 0.1377311646938324, "step": 112 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1904.0, "completions/max_terminated_length": 1904.0, "completions/mean_length": 1350.702392578125, "completions/mean_terminated_length": 1350.702392578125, "completions/min_length": 766.0, "completions/min_terminated_length": 766.0, "epoch": 0.17601246105919002, "grad_norm": 0.5633478164672852, "kl": 0.027842647396028042, "learning_rate": 1.93625e-06, "loss": 0.0033, "num_tokens": 14358717.0, "reward": 1.4617105722427368, "reward_std": 0.07503892481327057, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.46171048283576965, "rewards/correct_reward_func/std": 0.11019705981016159, "step": 113 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1946.0, "completions/max_terminated_length": 1946.0, "completions/mean_length": 1320.6785888671875, "completions/mean_terminated_length": 1320.6785888671875, "completions/min_length": 747.0, "completions/min_terminated_length": 747.0, "epoch": 0.17757009345794392, "grad_norm": 0.622870683670044, "kl": 0.02942818123847246, "learning_rate": 1.9356249999999998e-06, "loss": -0.0117, "num_tokens": 14475876.0, "reward": 1.4551728963851929, "reward_std": 0.07278000563383102, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4551727771759033, "rewards/correct_reward_func/std": 0.12961725890636444, "step": 114 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2867.0, "completions/max_terminated_length": 2867.0, "completions/mean_length": 1367.547607421875, "completions/mean_terminated_length": 1367.547607421875, "completions/min_length": 902.0, "completions/min_terminated_length": 902.0, "epoch": 0.1791277258566978, "grad_norm": 0.5723932385444641, "kl": 0.027969708666205406, "learning_rate": 1.935e-06, "loss": 0.0019, "num_tokens": 14596690.0, "reward": 1.424649715423584, "reward_std": 0.10459105670452118, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4365543723106384, "rewards/correct_reward_func/std": 0.13125936686992645, "step": 115 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2112.0, "completions/mean_length": 1496.8690185546875, "completions/mean_terminated_length": 1416.2047119140625, "completions/min_length": 826.0, "completions/min_terminated_length": 826.0, "epoch": 0.1806853582554517, "grad_norm": 0.5445118546485901, "kl": 0.027951962314546108, "learning_rate": 1.934375e-06, "loss": 0.0382, "num_tokens": 14728457.0, "reward": 1.4374322891235352, "reward_std": 0.08895470947027206, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4374321401119232, "rewards/correct_reward_func/std": 0.1454283893108368, "step": 116 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2093.0, "completions/max_terminated_length": 2093.0, "completions/mean_length": 1406.547607421875, "completions/mean_terminated_length": 1406.547607421875, "completions/min_length": 698.0, "completions/min_terminated_length": 698.0, "epoch": 0.1822429906542056, "grad_norm": 0.5892264246940613, "kl": 0.02943518850952387, "learning_rate": 1.93375e-06, "loss": 0.0251, "num_tokens": 14852511.0, "reward": 1.4491535425186157, "reward_std": 0.07768179476261139, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.44915345311164856, "rewards/correct_reward_func/std": 0.14331160485744476, "step": 117 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2191.0, "completions/max_terminated_length": 2191.0, "completions/mean_length": 1454.1429443359375, "completions/mean_terminated_length": 1454.1429443359375, "completions/min_length": 991.0, "completions/min_terminated_length": 991.0, "epoch": 0.1838006230529595, "grad_norm": 0.5593940615653992, "kl": 0.028669409453868866, "learning_rate": 1.933125e-06, "loss": -0.0092, "num_tokens": 14980683.0, "reward": 1.4328858852386475, "reward_std": 0.06446022540330887, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4328858554363251, "rewards/correct_reward_func/std": 0.15252062678337097, "step": 118 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1976.0, "completions/max_terminated_length": 1976.0, "completions/mean_length": 1330.416748046875, "completions/mean_terminated_length": 1330.416748046875, "completions/min_length": 564.0, "completions/min_terminated_length": 564.0, "epoch": 0.1853582554517134, "grad_norm": 0.6204725503921509, "kl": 0.031036019325256348, "learning_rate": 1.9325e-06, "loss": 0.0538, "num_tokens": 15098204.0, "reward": 1.4865161180496216, "reward_std": 0.06775263696908951, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4865160584449768, "rewards/correct_reward_func/std": 0.1286322921514511, "step": 119 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2474.0, "completions/max_terminated_length": 2474.0, "completions/mean_length": 1421.0714111328125, "completions/mean_terminated_length": 1421.0714111328125, "completions/min_length": 884.0, "completions/min_terminated_length": 884.0, "epoch": 0.18691588785046728, "grad_norm": 0.6097022294998169, "kl": 0.028833536431193352, "learning_rate": 1.931875e-06, "loss": -0.0063, "num_tokens": 15223664.0, "reward": 1.4651082754135132, "reward_std": 0.08360718935728073, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4770130515098572, "rewards/correct_reward_func/std": 0.15723736584186554, "step": 120 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2433.0, "completions/max_terminated_length": 2433.0, "completions/mean_length": 1469.916748046875, "completions/mean_terminated_length": 1469.916748046875, "completions/min_length": 896.0, "completions/min_terminated_length": 896.0, "epoch": 0.18847352024922118, "grad_norm": 0.5351011157035828, "kl": 0.028757021762430668, "learning_rate": 1.93125e-06, "loss": 0.0368, "num_tokens": 15353281.0, "reward": 1.4508754014968872, "reward_std": 0.06538330763578415, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4508753716945648, "rewards/correct_reward_func/std": 0.14440658688545227, "step": 121 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2518.0, "completions/max_terminated_length": 2518.0, "completions/mean_length": 1432.6190185546875, "completions/mean_terminated_length": 1432.6190185546875, "completions/min_length": 855.0, "completions/min_terminated_length": 855.0, "epoch": 0.19003115264797507, "grad_norm": 0.5838773846626282, "kl": 0.030190047807991505, "learning_rate": 1.930625e-06, "loss": 0.0004, "num_tokens": 15479627.0, "reward": 1.5679670572280884, "reward_std": 0.08373278379440308, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5679671168327332, "rewards/correct_reward_func/std": 0.17479771375656128, "step": 122 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2098.0, "completions/max_terminated_length": 2098.0, "completions/mean_length": 1383.1905517578125, "completions/mean_terminated_length": 1383.1905517578125, "completions/min_length": 873.0, "completions/min_terminated_length": 873.0, "epoch": 0.19158878504672897, "grad_norm": 0.603692889213562, "kl": 0.03224192373454571, "learning_rate": 1.9299999999999997e-06, "loss": 0.0088, "num_tokens": 15601791.0, "reward": 1.4391270875930786, "reward_std": 0.06994114071130753, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4391269087791443, "rewards/correct_reward_func/std": 0.14909610152244568, "step": 123 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2679.0, "completions/max_terminated_length": 2679.0, "completions/mean_length": 1490.2381591796875, "completions/mean_terminated_length": 1490.2381591796875, "completions/min_length": 1037.0, "completions/min_terminated_length": 1037.0, "epoch": 0.19314641744548286, "grad_norm": 0.5941579937934875, "kl": 0.02839325089007616, "learning_rate": 1.929375e-06, "loss": 0.0225, "num_tokens": 15733049.0, "reward": 1.415550947189331, "reward_std": 0.06161380559206009, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.41555076837539673, "rewards/correct_reward_func/std": 0.10922452807426453, "step": 124 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2151.0, "completions/max_terminated_length": 2151.0, "completions/mean_length": 1322.2857666015625, "completions/mean_terminated_length": 1322.2857666015625, "completions/min_length": 821.0, "completions/min_terminated_length": 821.0, "epoch": 0.19470404984423675, "grad_norm": 0.5985201001167297, "kl": 0.029462194070219994, "learning_rate": 1.92875e-06, "loss": -0.0175, "num_tokens": 15849953.0, "reward": 1.4787646532058716, "reward_std": 0.09507114440202713, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.47876468300819397, "rewards/correct_reward_func/std": 0.1848842203617096, "step": 125 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1882.0, "completions/max_terminated_length": 1882.0, "completions/mean_length": 1334.8333740234375, "completions/mean_terminated_length": 1334.8333740234375, "completions/min_length": 500.0, "completions/min_terminated_length": 500.0, "epoch": 0.19626168224299065, "grad_norm": 0.6323754191398621, "kl": 0.02924549486488104, "learning_rate": 1.928125e-06, "loss": -0.0014, "num_tokens": 15967953.0, "reward": 1.520105242729187, "reward_std": 0.07988641411066055, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5201051831245422, "rewards/correct_reward_func/std": 0.16170603036880493, "step": 126 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2265.0, "completions/max_terminated_length": 2265.0, "completions/mean_length": 1414.9881591796875, "completions/mean_terminated_length": 1414.9881591796875, "completions/min_length": 837.0, "completions/min_terminated_length": 837.0, "epoch": 0.19781931464174454, "grad_norm": 0.5964637994766235, "kl": 0.029463034123182297, "learning_rate": 1.9274999999999998e-06, "loss": 0.0118, "num_tokens": 16092890.0, "reward": 1.4794553518295288, "reward_std": 0.06303998827934265, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.47945523262023926, "rewards/correct_reward_func/std": 0.11690139025449753, "step": 127 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2203.0, "completions/mean_length": 1490.8214111328125, "completions/mean_terminated_length": 1410.084228515625, "completions/min_length": 791.0, "completions/min_terminated_length": 791.0, "epoch": 0.19937694704049844, "grad_norm": 0.5852746963500977, "kl": 0.029941502027213573, "learning_rate": 1.9268749999999997e-06, "loss": 0.0568, "num_tokens": 16224011.0, "reward": 1.4570320844650269, "reward_std": 0.13305586576461792, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.4808415472507477, "rewards/correct_reward_func/std": 0.15189340710639954, "step": 128 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2393.0, "completions/max_terminated_length": 2393.0, "completions/mean_length": 1419.9285888671875, "completions/mean_terminated_length": 1419.9285888671875, "completions/min_length": 934.0, "completions/min_terminated_length": 934.0, "epoch": 0.20093457943925233, "grad_norm": 0.6678198575973511, "kl": 0.02869417704641819, "learning_rate": 1.92625e-06, "loss": -0.0245, "num_tokens": 16349273.0, "reward": 1.4819056987762451, "reward_std": 0.0900546982884407, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4819056987762451, "rewards/correct_reward_func/std": 0.13743773102760315, "step": 129 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2579.0, "completions/max_terminated_length": 2579.0, "completions/mean_length": 1410.511962890625, "completions/mean_terminated_length": 1410.511962890625, "completions/min_length": 744.0, "completions/min_terminated_length": 744.0, "epoch": 0.20249221183800623, "grad_norm": 0.5829043984413147, "kl": 0.029658552259206772, "learning_rate": 1.925625e-06, "loss": 0.0304, "num_tokens": 16473606.0, "reward": 1.4438934326171875, "reward_std": 0.08044224977493286, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4438934326171875, "rewards/correct_reward_func/std": 0.15281730890274048, "step": 130 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2306.0, "completions/max_terminated_length": 2306.0, "completions/mean_length": 1419.6190185546875, "completions/mean_terminated_length": 1419.6190185546875, "completions/min_length": 876.0, "completions/min_terminated_length": 876.0, "epoch": 0.20404984423676012, "grad_norm": 0.6060424447059631, "kl": 0.029810849577188492, "learning_rate": 1.9249999999999998e-06, "loss": 0.0186, "num_tokens": 16598698.0, "reward": 1.4231759309768677, "reward_std": 0.13214969635009766, "rewards/contains_chinese/mean": 0.9642857313156128, "rewards/contains_chinese/std": 0.18669144809246063, "rewards/correct_reward_func/mean": 0.4588901996612549, "rewards/correct_reward_func/std": 0.17215143144130707, "step": 131 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.023809523809523836, "completions/max_length": 8192.0, "completions/max_terminated_length": 2134.0, "completions/mean_length": 1587.047607421875, "completions/mean_terminated_length": 1425.951171875, "completions/min_length": 791.0, "completions/min_terminated_length": 791.0, "epoch": 0.205607476635514, "grad_norm": 0.5848340392112732, "kl": 0.02779593039304018, "learning_rate": 1.9243749999999997e-06, "loss": 0.1231, "num_tokens": 16738082.0, "reward": 1.418549656867981, "reward_std": 0.09781080484390259, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.41854962706565857, "rewards/correct_reward_func/std": 0.15713582932949066, "step": 132 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2101.0, "completions/max_terminated_length": 2101.0, "completions/mean_length": 1363.107177734375, "completions/mean_terminated_length": 1363.107177734375, "completions/min_length": 767.0, "completions/min_terminated_length": 767.0, "epoch": 0.2071651090342679, "grad_norm": 0.6095062494277954, "kl": 0.02982013951987028, "learning_rate": 1.92375e-06, "loss": -0.0032, "num_tokens": 16858487.0, "reward": 1.4797476530075073, "reward_std": 0.08453521132469177, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.47974759340286255, "rewards/correct_reward_func/std": 0.13973869383335114, "step": 133 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2277.0, "completions/max_terminated_length": 2277.0, "completions/mean_length": 1384.3095703125, "completions/mean_terminated_length": 1384.3095703125, "completions/min_length": 870.0, "completions/min_terminated_length": 870.0, "epoch": 0.2087227414330218, "grad_norm": 0.5872117280960083, "kl": 0.03033105470240116, "learning_rate": 1.923125e-06, "loss": -0.0099, "num_tokens": 16980787.0, "reward": 1.4834345579147339, "reward_std": 0.0603872612118721, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.48343440890312195, "rewards/correct_reward_func/std": 0.16034552454948425, "step": 134 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2117.0, "completions/max_terminated_length": 2117.0, "completions/mean_length": 1370.40478515625, "completions/mean_terminated_length": 1370.40478515625, "completions/min_length": 822.0, "completions/min_terminated_length": 822.0, "epoch": 0.2102803738317757, "grad_norm": 0.6103830337524414, "kl": 0.030320947989821434, "learning_rate": 1.9225e-06, "loss": 0.0061, "num_tokens": 17101811.0, "reward": 1.4273531436920166, "reward_std": 0.09265647828578949, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4392578601837158, "rewards/correct_reward_func/std": 0.13740864396095276, "step": 135 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1993.0, "completions/max_terminated_length": 1993.0, "completions/mean_length": 1434.46435546875, "completions/mean_terminated_length": 1434.46435546875, "completions/min_length": 812.0, "completions/min_terminated_length": 812.0, "epoch": 0.2118380062305296, "grad_norm": 0.603561520576477, "kl": 0.02980469260364771, "learning_rate": 1.9218749999999997e-06, "loss": -0.0064, "num_tokens": 17228618.0, "reward": 1.4576891660690308, "reward_std": 0.05491868779063225, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.45768895745277405, "rewards/correct_reward_func/std": 0.14544495940208435, "step": 136 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2719.0, "completions/max_terminated_length": 2719.0, "completions/mean_length": 1493.416748046875, "completions/mean_terminated_length": 1493.416748046875, "completions/min_length": 932.0, "completions/min_terminated_length": 932.0, "epoch": 0.21339563862928349, "grad_norm": 0.5915752053260803, "kl": 0.028933603316545486, "learning_rate": 1.9212499999999996e-06, "loss": -0.0056, "num_tokens": 17360071.0, "reward": 1.5237936973571777, "reward_std": 0.07290388643741608, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5237936973571777, "rewards/correct_reward_func/std": 0.14304442703723907, "step": 137 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2306.0, "completions/max_terminated_length": 2306.0, "completions/mean_length": 1456.1785888671875, "completions/mean_terminated_length": 1456.1785888671875, "completions/min_length": 971.0, "completions/min_terminated_length": 971.0, "epoch": 0.21495327102803738, "grad_norm": 0.5867950916290283, "kl": 0.030623883940279484, "learning_rate": 1.920625e-06, "loss": 0.0369, "num_tokens": 17488402.0, "reward": 1.424317717552185, "reward_std": 0.09510175883769989, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4362224042415619, "rewards/correct_reward_func/std": 0.15234197676181793, "step": 138 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7913.0, "completions/max_terminated_length": 7913.0, "completions/mean_length": 1533.6190185546875, "completions/mean_terminated_length": 1533.6190185546875, "completions/min_length": 987.0, "completions/min_terminated_length": 987.0, "epoch": 0.21651090342679127, "grad_norm": 0.5414514541625977, "kl": 0.02811363060027361, "learning_rate": 1.92e-06, "loss": 0.0618, "num_tokens": 17623412.0, "reward": 1.491133689880371, "reward_std": 0.06729433685541153, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.49113360047340393, "rewards/correct_reward_func/std": 0.17636097967624664, "step": 139 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1872.0, "completions/max_terminated_length": 1872.0, "completions/mean_length": 1416.011962890625, "completions/mean_terminated_length": 1416.011962890625, "completions/min_length": 852.0, "completions/min_terminated_length": 852.0, "epoch": 0.21806853582554517, "grad_norm": 0.5571795701980591, "kl": 0.03138226270675659, "learning_rate": 1.919375e-06, "loss": 0.0457, "num_tokens": 17748435.0, "reward": 1.4378496408462524, "reward_std": 0.07887466251850128, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4378494918346405, "rewards/correct_reward_func/std": 0.10594429075717926, "step": 140 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2041.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1325.5833740234375, "completions/mean_terminated_length": 1325.5833740234375, "completions/min_length": 374.0, "completions/min_terminated_length": 374.0, "epoch": 0.21962616822429906, "grad_norm": 0.5585833191871643, "kl": 0.03183058649301529, "learning_rate": 1.91875e-06, "loss": -0.0038, "num_tokens": 17865712.0, "reward": 1.5206776857376099, "reward_std": 0.089789979159832, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5206776261329651, "rewards/correct_reward_func/std": 0.18472737073898315, "step": 141 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1899.0, "completions/max_terminated_length": 1899.0, "completions/mean_length": 1388.392822265625, "completions/mean_terminated_length": 1388.392822265625, "completions/min_length": 950.0, "completions/min_terminated_length": 950.0, "epoch": 0.22118380062305296, "grad_norm": 0.5522568225860596, "kl": 0.030762070789933205, "learning_rate": 1.918125e-06, "loss": 0.0044, "num_tokens": 17988241.0, "reward": 1.4582120180130005, "reward_std": 0.13537071645259857, "rewards/contains_chinese/mean": 0.9642857313156128, "rewards/contains_chinese/std": 0.18669144809246063, "rewards/correct_reward_func/mean": 0.49392637610435486, "rewards/correct_reward_func/std": 0.1673257201910019, "step": 142 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2187.0, "completions/max_terminated_length": 2187.0, "completions/mean_length": 1382.2857666015625, "completions/mean_terminated_length": 1382.2857666015625, "completions/min_length": 866.0, "completions/min_terminated_length": 866.0, "epoch": 0.22274143302180685, "grad_norm": 0.6352323889732361, "kl": 0.03129299636930227, "learning_rate": 1.9175e-06, "loss": 0.0264, "num_tokens": 18110221.0, "reward": 1.4690262079238892, "reward_std": 0.0818016454577446, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.469026118516922, "rewards/correct_reward_func/std": 0.152619868516922, "step": 143 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2218.0, "completions/max_terminated_length": 2218.0, "completions/mean_length": 1394.166748046875, "completions/mean_terminated_length": 1394.166748046875, "completions/min_length": 888.0, "completions/min_terminated_length": 888.0, "epoch": 0.22429906542056074, "grad_norm": 0.63814777135849, "kl": 0.03126653470098972, "learning_rate": 1.916875e-06, "loss": 0.0072, "num_tokens": 18233295.0, "reward": 1.5044426918029785, "reward_std": 0.10564389079809189, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5163474082946777, "rewards/correct_reward_func/std": 0.15176692605018616, "step": 144 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1836.0, "completions/max_terminated_length": 1836.0, "completions/mean_length": 1409.261962890625, "completions/mean_terminated_length": 1409.261962890625, "completions/min_length": 814.0, "completions/min_terminated_length": 814.0, "epoch": 0.22585669781931464, "grad_norm": 0.6800863146781921, "kl": 0.03138407226651907, "learning_rate": 1.91625e-06, "loss": 0.0101, "num_tokens": 18357601.0, "reward": 1.3754723072052002, "reward_std": 0.12025143206119537, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.3992818295955658, "rewards/correct_reward_func/std": 0.1352054476737976, "step": 145 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1912.0, "completions/max_terminated_length": 1912.0, "completions/mean_length": 1371.5833740234375, "completions/mean_terminated_length": 1371.5833740234375, "completions/min_length": 878.0, "completions/min_terminated_length": 878.0, "epoch": 0.22741433021806853, "grad_norm": 0.5346211791038513, "kl": 0.03179503232240677, "learning_rate": 1.915625e-06, "loss": 0.0068, "num_tokens": 18478958.0, "reward": 1.4643080234527588, "reward_std": 0.05763059854507446, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.46430787444114685, "rewards/correct_reward_func/std": 0.13611404597759247, "step": 146 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2153.0, "completions/max_terminated_length": 2153.0, "completions/mean_length": 1347.75, "completions/mean_terminated_length": 1347.75, "completions/min_length": 888.0, "completions/min_terminated_length": 888.0, "epoch": 0.22897196261682243, "grad_norm": 0.6036585569381714, "kl": 0.032050169073045254, "learning_rate": 1.915e-06, "loss": 0.0138, "num_tokens": 18598259.0, "reward": 1.5019899606704712, "reward_std": 0.07086333632469177, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5019899010658264, "rewards/correct_reward_func/std": 0.12254533916711807, "step": 147 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1863.0, "completions/max_terminated_length": 1863.0, "completions/mean_length": 1349.4761962890625, "completions/mean_terminated_length": 1349.4761962890625, "completions/min_length": 539.0, "completions/min_terminated_length": 539.0, "epoch": 0.23052959501557632, "grad_norm": 0.5723428130149841, "kl": 0.033308178186416626, "learning_rate": 1.9143749999999998e-06, "loss": -0.0342, "num_tokens": 18717507.0, "reward": 1.439958095550537, "reward_std": 0.09097757190465927, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.45186278223991394, "rewards/correct_reward_func/std": 0.15231125056743622, "step": 148 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1705.0, "completions/max_terminated_length": 1705.0, "completions/mean_length": 1306.0714111328125, "completions/mean_terminated_length": 1306.0714111328125, "completions/min_length": 493.0, "completions/min_terminated_length": 493.0, "epoch": 0.23208722741433022, "grad_norm": 0.6826386451721191, "kl": 0.040258824825286865, "learning_rate": 1.91375e-06, "loss": -0.0055, "num_tokens": 18833091.0, "reward": 1.432396411895752, "reward_std": 0.0788629949092865, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4323963522911072, "rewards/correct_reward_func/std": 0.1607416421175003, "step": 149 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1910.0, "completions/max_terminated_length": 1910.0, "completions/mean_length": 1339.7857666015625, "completions/mean_terminated_length": 1339.7857666015625, "completions/min_length": 815.0, "completions/min_terminated_length": 815.0, "epoch": 0.2336448598130841, "grad_norm": 0.6238492131233215, "kl": 0.033289359882473946, "learning_rate": 1.913125e-06, "loss": -0.0105, "num_tokens": 18951591.0, "reward": 1.4412423372268677, "reward_std": 0.06025463342666626, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.44124215841293335, "rewards/correct_reward_func/std": 0.1178692877292633, "step": 150 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1867.0, "completions/max_terminated_length": 1867.0, "completions/mean_length": 1272.0357666015625, "completions/mean_terminated_length": 1272.0357666015625, "completions/min_length": 821.0, "completions/min_terminated_length": 821.0, "epoch": 0.235202492211838, "grad_norm": 0.6190460920333862, "kl": 0.034545375034213066, "learning_rate": 1.9125e-06, "loss": -0.0009, "num_tokens": 19064406.0, "reward": 1.402614951133728, "reward_std": 0.054387416690588, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.40261486172676086, "rewards/correct_reward_func/std": 0.13192394375801086, "step": 151 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6207.0, "completions/max_terminated_length": 6207.0, "completions/mean_length": 1383.2738037109375, "completions/mean_terminated_length": 1383.2738037109375, "completions/min_length": 703.0, "completions/min_terminated_length": 703.0, "epoch": 0.2367601246105919, "grad_norm": 0.6016775369644165, "kl": 0.03410913795232773, "learning_rate": 1.911875e-06, "loss": -0.0575, "num_tokens": 19186433.0, "reward": 1.465145468711853, "reward_std": 0.06573602557182312, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.46514537930488586, "rewards/correct_reward_func/std": 0.13837113976478577, "step": 152 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1986.0, "completions/max_terminated_length": 1986.0, "completions/mean_length": 1315.6309814453125, "completions/mean_terminated_length": 1315.6309814453125, "completions/min_length": 607.0, "completions/min_terminated_length": 607.0, "epoch": 0.2383177570093458, "grad_norm": 0.6019495725631714, "kl": 0.03409886732697487, "learning_rate": 1.9112499999999997e-06, "loss": -0.0083, "num_tokens": 19302892.0, "reward": 1.4242923259735107, "reward_std": 0.0667426809668541, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4242922365665436, "rewards/correct_reward_func/std": 0.13553784787654877, "step": 153 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2117.0, "completions/max_terminated_length": 2117.0, "completions/mean_length": 1342.0833740234375, "completions/mean_terminated_length": 1342.0833740234375, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.2398753894080997, "grad_norm": 0.5771262049674988, "kl": 0.0346537921577692, "learning_rate": 1.910625e-06, "loss": -0.0074, "num_tokens": 19421651.0, "reward": 1.4762444496154785, "reward_std": 0.08208738267421722, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4762443006038666, "rewards/correct_reward_func/std": 0.19542856514453888, "step": 154 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2065.0, "completions/max_terminated_length": 2065.0, "completions/mean_length": 1312.6905517578125, "completions/mean_terminated_length": 1312.6905517578125, "completions/min_length": 753.0, "completions/min_terminated_length": 753.0, "epoch": 0.24143302180685358, "grad_norm": 0.5773271918296814, "kl": 0.03536880388855934, "learning_rate": 1.91e-06, "loss": -0.0149, "num_tokens": 19537893.0, "reward": 1.4299932718276978, "reward_std": 0.06326950341463089, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.429993212223053, "rewards/correct_reward_func/std": 0.15807007253170013, "step": 155 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2368.0, "completions/max_terminated_length": 2368.0, "completions/mean_length": 1352.166748046875, "completions/mean_terminated_length": 1352.166748046875, "completions/min_length": 765.0, "completions/min_terminated_length": 765.0, "epoch": 0.24299065420560748, "grad_norm": 0.6289856433868408, "kl": 0.03449527733027935, "learning_rate": 1.909375e-06, "loss": -0.003, "num_tokens": 19657355.0, "reward": 1.4612387418746948, "reward_std": 0.06969407945871353, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.46123871207237244, "rewards/correct_reward_func/std": 0.1171262189745903, "step": 156 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1960.0, "completions/max_terminated_length": 1960.0, "completions/mean_length": 1367.142822265625, "completions/mean_terminated_length": 1367.142822265625, "completions/min_length": 612.0, "completions/min_terminated_length": 612.0, "epoch": 0.24454828660436137, "grad_norm": 0.5991291999816895, "kl": 0.034693608060479164, "learning_rate": 1.9087499999999997e-06, "loss": 0.0271, "num_tokens": 19778057.0, "reward": 1.3796496391296387, "reward_std": 0.056341852992773056, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.3796495795249939, "rewards/correct_reward_func/std": 0.11140848696231842, "step": 157 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2143.0, "completions/max_terminated_length": 2143.0, "completions/mean_length": 1404.4881591796875, "completions/mean_terminated_length": 1404.4881591796875, "completions/min_length": 936.0, "completions/min_terminated_length": 936.0, "epoch": 0.24610591900311526, "grad_norm": 0.5495375990867615, "kl": 0.03542102687060833, "learning_rate": 1.908125e-06, "loss": 0.0074, "num_tokens": 19902070.0, "reward": 1.47572660446167, "reward_std": 0.10572995990514755, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.49953609704971313, "rewards/correct_reward_func/std": 0.17438319325447083, "step": 158 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2159.0, "completions/mean_length": 1494.416748046875, "completions/mean_terminated_length": 1413.7227783203125, "completions/min_length": 589.0, "completions/min_terminated_length": 589.0, "epoch": 0.24766355140186916, "grad_norm": 0.5821869373321533, "kl": 0.03435787186026573, "learning_rate": 1.9075e-06, "loss": 0.0632, "num_tokens": 20033679.0, "reward": 1.4355047941207886, "reward_std": 0.0911315381526947, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.44740939140319824, "rewards/correct_reward_func/std": 0.15956489741802216, "step": 159 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2155.0, "completions/max_terminated_length": 2155.0, "completions/mean_length": 1374.7857666015625, "completions/mean_terminated_length": 1374.7857666015625, "completions/min_length": 933.0, "completions/min_terminated_length": 933.0, "epoch": 0.24922118380062305, "grad_norm": 0.6089724898338318, "kl": 0.033023279160261154, "learning_rate": 1.906875e-06, "loss": 0.0046, "num_tokens": 20155203.0, "reward": 1.4912819862365723, "reward_std": 0.05513819307088852, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4912818670272827, "rewards/correct_reward_func/std": 0.16794303059577942, "step": 160 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2273.0, "completions/max_terminated_length": 2273.0, "completions/mean_length": 1431.2381591796875, "completions/mean_terminated_length": 1431.2381591796875, "completions/min_length": 807.0, "completions/min_terminated_length": 807.0, "epoch": 0.2507788161993769, "grad_norm": 0.5501843690872192, "kl": 0.03268396854400635, "learning_rate": 1.90625e-06, "loss": 0.0046, "num_tokens": 20281463.0, "reward": 1.4660686254501343, "reward_std": 0.0724034234881401, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4660685062408447, "rewards/correct_reward_func/std": 0.16601873934268951, "step": 161 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2477.0, "completions/mean_length": 1548.9881591796875, "completions/mean_terminated_length": 1468.9517822265625, "completions/min_length": 942.0, "completions/min_terminated_length": 942.0, "epoch": 0.2523364485981308, "grad_norm": 0.5610131621360779, "kl": 0.03170663956552744, "learning_rate": 1.9056249999999999e-06, "loss": 0.0445, "num_tokens": 20417668.0, "reward": 1.451736569404602, "reward_std": 0.09086348861455917, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4517364501953125, "rewards/correct_reward_func/std": 0.17963163554668427, "step": 162 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.023809523809523836, "completions/max_length": 8192.0, "completions/max_terminated_length": 2153.0, "completions/mean_length": 1610.5, "completions/mean_terminated_length": 1449.9755859375, "completions/min_length": 964.0, "completions/min_terminated_length": 964.0, "epoch": 0.2538940809968847, "grad_norm": 0.5172660946846008, "kl": 0.030543990433216095, "learning_rate": 1.905e-06, "loss": 0.1107, "num_tokens": 20558926.0, "reward": 1.4814132452011108, "reward_std": 0.09557101875543594, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4814131557941437, "rewards/correct_reward_func/std": 0.1708441823720932, "step": 163 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2249.0, "completions/max_terminated_length": 2249.0, "completions/mean_length": 1437.416748046875, "completions/mean_terminated_length": 1437.416748046875, "completions/min_length": 633.0, "completions/min_terminated_length": 633.0, "epoch": 0.2554517133956386, "grad_norm": 0.5784984230995178, "kl": 0.03393215127289295, "learning_rate": 1.9043749999999999e-06, "loss": 0.0018, "num_tokens": 20685663.0, "reward": 1.5089834928512573, "reward_std": 0.08463006466627121, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5208882093429565, "rewards/correct_reward_func/std": 0.1223825141787529, "step": 164 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3265.0, "completions/max_terminated_length": 3265.0, "completions/mean_length": 1466.6190185546875, "completions/mean_terminated_length": 1466.6190185546875, "completions/min_length": 472.0, "completions/min_terminated_length": 472.0, "epoch": 0.2570093457943925, "grad_norm": 0.6114115715026855, "kl": 0.03406968712806702, "learning_rate": 1.90375e-06, "loss": 0.0058, "num_tokens": 20814859.0, "reward": 1.4296975135803223, "reward_std": 0.08567629754543304, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4296974539756775, "rewards/correct_reward_func/std": 0.12944677472114563, "step": 165 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6028.0, "completions/max_terminated_length": 6028.0, "completions/mean_length": 1430.15478515625, "completions/mean_terminated_length": 1430.15478515625, "completions/min_length": 660.0, "completions/min_terminated_length": 660.0, "epoch": 0.2585669781931464, "grad_norm": 0.6096552014350891, "kl": 0.03403060883283615, "learning_rate": 1.9031249999999999e-06, "loss": 0.0408, "num_tokens": 20940830.0, "reward": 1.4890365600585938, "reward_std": 0.05935479328036308, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4890367090702057, "rewards/correct_reward_func/std": 0.17837880551815033, "step": 166 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2530.0, "completions/mean_length": 1496.1785888671875, "completions/mean_terminated_length": 1415.5059814453125, "completions/min_length": 706.0, "completions/min_terminated_length": 706.0, "epoch": 0.2601246105919003, "grad_norm": 0.5552809238433838, "kl": 0.03266907203942537, "learning_rate": 1.9025e-06, "loss": 0.0631, "num_tokens": 21072461.0, "reward": 1.4831464290618896, "reward_std": 0.07709907740354538, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.48314639925956726, "rewards/correct_reward_func/std": 0.1887180060148239, "step": 167 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2289.0, "completions/max_terminated_length": 2289.0, "completions/mean_length": 1424.15478515625, "completions/mean_terminated_length": 1424.15478515625, "completions/min_length": 1038.0, "completions/min_terminated_length": 1038.0, "epoch": 0.2616822429906542, "grad_norm": 0.5698901414871216, "kl": 0.03557535447180271, "learning_rate": 1.901875e-06, "loss": 0.0045, "num_tokens": 21198000.0, "reward": 1.4246412515640259, "reward_std": 0.07475357502698898, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4246411919593811, "rewards/correct_reward_func/std": 0.12371546775102615, "step": 168 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3198.0, "completions/max_terminated_length": 3198.0, "completions/mean_length": 1557.857177734375, "completions/mean_terminated_length": 1557.857177734375, "completions/min_length": 911.0, "completions/min_terminated_length": 911.0, "epoch": 0.2632398753894081, "grad_norm": 0.5663595795631409, "kl": 0.03528137691318989, "learning_rate": 1.90125e-06, "loss": 0.0162, "num_tokens": 21334890.0, "reward": 1.4614077806472778, "reward_std": 0.06879691779613495, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.46140775084495544, "rewards/correct_reward_func/std": 0.12173257023096085, "step": 169 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2376.0, "completions/mean_length": 1575.3095703125, "completions/mean_terminated_length": 1495.59033203125, "completions/min_length": 767.0, "completions/min_terminated_length": 767.0, "epoch": 0.26479750778816197, "grad_norm": 0.5297430753707886, "kl": 0.03137396089732647, "learning_rate": 1.900625e-06, "loss": 0.0552, "num_tokens": 21473114.0, "reward": 1.4736964702606201, "reward_std": 0.08518790453672409, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.47369638085365295, "rewards/correct_reward_func/std": 0.15848514437675476, "step": 170 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2487.0, "completions/max_terminated_length": 2487.0, "completions/mean_length": 1467.511962890625, "completions/mean_terminated_length": 1467.511962890625, "completions/min_length": 784.0, "completions/min_terminated_length": 784.0, "epoch": 0.26635514018691586, "grad_norm": 0.5714218020439148, "kl": 0.03465087711811066, "learning_rate": 1.8999999999999998e-06, "loss": -0.0139, "num_tokens": 21602295.0, "reward": 1.4387915134429932, "reward_std": 0.07327855378389359, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.438791424036026, "rewards/correct_reward_func/std": 0.14280220866203308, "step": 171 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2169.0, "completions/max_terminated_length": 2169.0, "completions/mean_length": 1464.6190185546875, "completions/mean_terminated_length": 1464.6190185546875, "completions/min_length": 792.0, "completions/min_terminated_length": 792.0, "epoch": 0.26791277258566976, "grad_norm": 0.5421945452690125, "kl": 0.034480318427085876, "learning_rate": 1.899375e-06, "loss": 0.0296, "num_tokens": 21731377.0, "reward": 1.4377334117889404, "reward_std": 0.0828586295247078, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4496382176876068, "rewards/correct_reward_func/std": 0.13945025205612183, "step": 172 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2290.0, "completions/max_terminated_length": 2290.0, "completions/mean_length": 1374.34521484375, "completions/mean_terminated_length": 1374.34521484375, "completions/min_length": 894.0, "completions/min_terminated_length": 894.0, "epoch": 0.26947040498442365, "grad_norm": 0.6028652191162109, "kl": 0.03403773531317711, "learning_rate": 1.8987499999999998e-06, "loss": -0.0284, "num_tokens": 21852672.0, "reward": 1.4843207597732544, "reward_std": 0.0630205050110817, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4843207001686096, "rewards/correct_reward_func/std": 0.14204466342926025, "step": 173 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2999.0, "completions/max_terminated_length": 2999.0, "completions/mean_length": 1529.8809814453125, "completions/mean_terminated_length": 1529.8809814453125, "completions/min_length": 768.0, "completions/min_terminated_length": 768.0, "epoch": 0.27102803738317754, "grad_norm": 0.5906941890716553, "kl": 0.034525854513049126, "learning_rate": 1.898125e-06, "loss": 0.0167, "num_tokens": 21987254.0, "reward": 1.3962242603302002, "reward_std": 0.0881531834602356, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4081288278102875, "rewards/correct_reward_func/std": 0.08537304401397705, "step": 174 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3739.0, "completions/max_terminated_length": 3739.0, "completions/mean_length": 1501.166748046875, "completions/mean_terminated_length": 1501.166748046875, "completions/min_length": 827.0, "completions/min_terminated_length": 827.0, "epoch": 0.27258566978193144, "grad_norm": 0.5807496905326843, "kl": 0.03358875773847103, "learning_rate": 1.8974999999999998e-06, "loss": 0.0106, "num_tokens": 22119274.0, "reward": 1.4818309545516968, "reward_std": 0.08105891197919846, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4818309247493744, "rewards/correct_reward_func/std": 0.1239246129989624, "step": 175 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2070.0, "completions/max_terminated_length": 2070.0, "completions/mean_length": 1430.09521484375, "completions/mean_terminated_length": 1430.09521484375, "completions/min_length": 860.0, "completions/min_terminated_length": 860.0, "epoch": 0.27414330218068533, "grad_norm": 0.5961090922355652, "kl": 0.03483774699270725, "learning_rate": 1.896875e-06, "loss": -0.0327, "num_tokens": 22245432.0, "reward": 1.4600034952163696, "reward_std": 0.08812181651592255, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4719081223011017, "rewards/correct_reward_func/std": 0.1560893952846527, "step": 176 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2218.0, "completions/mean_length": 1490.65478515625, "completions/mean_terminated_length": 1409.9156494140625, "completions/min_length": 850.0, "completions/min_terminated_length": 850.0, "epoch": 0.2757009345794392, "grad_norm": 0.5829291343688965, "kl": 0.033502571284770966, "learning_rate": 1.8962499999999998e-06, "loss": 0.0753, "num_tokens": 22376581.0, "reward": 1.4762822389602661, "reward_std": 0.09881778061389923, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4762822091579437, "rewards/correct_reward_func/std": 0.15524689853191376, "step": 177 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.023809523809523836, "completions/max_length": 8192.0, "completions/max_terminated_length": 2410.0, "completions/mean_length": 1661.4405517578125, "completions/mean_terminated_length": 1502.158447265625, "completions/min_length": 1023.0, "completions/min_terminated_length": 1023.0, "epoch": 0.2772585669781931, "grad_norm": 0.49473848938941956, "kl": 0.031944600865244865, "learning_rate": 1.8956249999999997e-06, "loss": 0.1143, "num_tokens": 22522250.0, "reward": 1.4583110809326172, "reward_std": 0.07724699378013611, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4583110809326172, "rewards/correct_reward_func/std": 0.15963733196258545, "step": 178 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2345.0, "completions/max_terminated_length": 2345.0, "completions/mean_length": 1503.09521484375, "completions/mean_terminated_length": 1503.09521484375, "completions/min_length": 880.0, "completions/min_terminated_length": 880.0, "epoch": 0.278816199376947, "grad_norm": 0.5652268528938293, "kl": 0.033452507108449936, "learning_rate": 1.8949999999999999e-06, "loss": -0.0117, "num_tokens": 22654690.0, "reward": 1.539380431175232, "reward_std": 0.07203835994005203, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5393803119659424, "rewards/correct_reward_func/std": 0.15730725228786469, "step": 179 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2153.0, "completions/max_terminated_length": 2153.0, "completions/mean_length": 1424.7381591796875, "completions/mean_terminated_length": 1424.7381591796875, "completions/min_length": 752.0, "completions/min_terminated_length": 752.0, "epoch": 0.2803738317757009, "grad_norm": 0.6266071796417236, "kl": 0.03748060762882233, "learning_rate": 1.8943749999999998e-06, "loss": 0.0132, "num_tokens": 22780308.0, "reward": 1.4268115758895874, "reward_std": 0.06137494370341301, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4268115758895874, "rewards/correct_reward_func/std": 0.1579686850309372, "step": 180 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2288.0, "completions/mean_length": 1513.3214111328125, "completions/mean_terminated_length": 1432.8553466796875, "completions/min_length": 946.0, "completions/min_terminated_length": 946.0, "epoch": 0.2819314641744548, "grad_norm": 0.5787291526794434, "kl": 0.03337083198130131, "learning_rate": 1.8937499999999999e-06, "loss": 0.0331, "num_tokens": 22913361.0, "reward": 1.4345508813858032, "reward_std": 0.08300718665122986, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.43455079197883606, "rewards/correct_reward_func/std": 0.12243600934743881, "step": 181 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2425.0, "completions/max_terminated_length": 2425.0, "completions/mean_length": 1513.0238037109375, "completions/mean_terminated_length": 1513.0238037109375, "completions/min_length": 749.0, "completions/min_terminated_length": 749.0, "epoch": 0.2834890965732087, "grad_norm": 0.5958402752876282, "kl": 0.0339033342897892, "learning_rate": 1.8931249999999998e-06, "loss": -0.0335, "num_tokens": 23046509.0, "reward": 1.4896963834762573, "reward_std": 0.074093759059906, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4896962344646454, "rewards/correct_reward_func/std": 0.18158133327960968, "step": 182 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2347.0, "completions/max_terminated_length": 2347.0, "completions/mean_length": 1505.4405517578125, "completions/mean_terminated_length": 1505.4405517578125, "completions/min_length": 1023.0, "completions/min_terminated_length": 1023.0, "epoch": 0.2850467289719626, "grad_norm": 0.5892803072929382, "kl": 0.03511413745582104, "learning_rate": 1.8924999999999999e-06, "loss": -0.0281, "num_tokens": 23179050.0, "reward": 1.4773499965667725, "reward_std": 0.06594084203243256, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4773499667644501, "rewards/correct_reward_func/std": 0.12919507920742035, "step": 183 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2235.0, "completions/mean_length": 1522.84521484375, "completions/mean_terminated_length": 1442.493896484375, "completions/min_length": 865.0, "completions/min_terminated_length": 865.0, "epoch": 0.2866043613707165, "grad_norm": 0.5434836149215698, "kl": 0.034950753673911095, "learning_rate": 1.891875e-06, "loss": 0.0603, "num_tokens": 23312909.0, "reward": 1.4319921731948853, "reward_std": 0.09058649092912674, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4438968300819397, "rewards/correct_reward_func/std": 0.1126542016863823, "step": 184 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2118.0, "completions/max_terminated_length": 2118.0, "completions/mean_length": 1422.1190185546875, "completions/mean_terminated_length": 1422.1190185546875, "completions/min_length": 793.0, "completions/min_terminated_length": 793.0, "epoch": 0.2881619937694704, "grad_norm": 0.6052145957946777, "kl": 0.03563849255442619, "learning_rate": 1.89125e-06, "loss": -0.0224, "num_tokens": 23438439.0, "reward": 1.508691668510437, "reward_std": 0.06740865856409073, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5086915493011475, "rewards/correct_reward_func/std": 0.1557309627532959, "step": 185 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2163.0, "completions/max_terminated_length": 2163.0, "completions/mean_length": 1442.6905517578125, "completions/mean_terminated_length": 1442.6905517578125, "completions/min_length": 877.0, "completions/min_terminated_length": 877.0, "epoch": 0.2897196261682243, "grad_norm": 0.5826404094696045, "kl": 0.03599457070231438, "learning_rate": 1.890625e-06, "loss": 0.022, "num_tokens": 23565397.0, "reward": 1.44635009765625, "reward_std": 0.06368335336446762, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.44635000824928284, "rewards/correct_reward_func/std": 0.14564184844493866, "step": 186 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2051.0, "completions/max_terminated_length": 2051.0, "completions/mean_length": 1438.2381591796875, "completions/mean_terminated_length": 1438.2381591796875, "completions/min_length": 938.0, "completions/min_terminated_length": 938.0, "epoch": 0.29127725856697817, "grad_norm": 0.597940981388092, "kl": 0.0359827596694231, "learning_rate": 1.89e-06, "loss": -0.0061, "num_tokens": 23691981.0, "reward": 1.4225126504898071, "reward_std": 0.07259950041770935, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4225126802921295, "rewards/correct_reward_func/std": 0.14631718397140503, "step": 187 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2586.0, "completions/max_terminated_length": 2586.0, "completions/mean_length": 1509.1785888671875, "completions/mean_terminated_length": 1509.1785888671875, "completions/min_length": 810.0, "completions/min_terminated_length": 810.0, "epoch": 0.29283489096573206, "grad_norm": 0.5778841972351074, "kl": 0.03521919064223766, "learning_rate": 1.889375e-06, "loss": -0.0273, "num_tokens": 23824830.0, "reward": 1.4619708061218262, "reward_std": 0.07388392835855484, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.46197065711021423, "rewards/correct_reward_func/std": 0.13990521430969238, "step": 188 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1537.8333740234375, "completions/mean_terminated_length": 1457.66259765625, "completions/min_length": 970.0, "completions/min_terminated_length": 970.0, "epoch": 0.29439252336448596, "grad_norm": 0.5850031971931458, "kl": 0.036018045619130135, "learning_rate": 1.88875e-06, "loss": 0.078, "num_tokens": 23959864.0, "reward": 1.4022135734558105, "reward_std": 0.06926076114177704, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.402213454246521, "rewards/correct_reward_func/std": 0.15882909297943115, "step": 189 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2162.0, "completions/max_terminated_length": 2162.0, "completions/mean_length": 1444.0833740234375, "completions/mean_terminated_length": 1444.0833740234375, "completions/min_length": 870.0, "completions/min_terminated_length": 870.0, "epoch": 0.29595015576323985, "grad_norm": 0.6393625736236572, "kl": 0.03580236993730068, "learning_rate": 1.888125e-06, "loss": -0.0305, "num_tokens": 24087125.0, "reward": 1.4801563024520874, "reward_std": 0.0749017521739006, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4801561236381531, "rewards/correct_reward_func/std": 0.1525711715221405, "step": 190 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2551.0, "completions/max_terminated_length": 2551.0, "completions/mean_length": 1454.8929443359375, "completions/mean_terminated_length": 1454.8929443359375, "completions/min_length": 959.0, "completions/min_terminated_length": 959.0, "epoch": 0.29750778816199375, "grad_norm": 0.6086553931236267, "kl": 0.036950401961803436, "learning_rate": 1.8875e-06, "loss": -0.0006, "num_tokens": 24215414.0, "reward": 1.4507030248641968, "reward_std": 0.07029537856578827, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4507029950618744, "rewards/correct_reward_func/std": 0.1585504561662674, "step": 191 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2119.0, "completions/max_terminated_length": 2119.0, "completions/mean_length": 1376.3690185546875, "completions/mean_terminated_length": 1376.3690185546875, "completions/min_length": 776.0, "completions/min_terminated_length": 776.0, "epoch": 0.29906542056074764, "grad_norm": 0.6285943984985352, "kl": 0.03829081356525421, "learning_rate": 1.886875e-06, "loss": 0.015, "num_tokens": 24336927.0, "reward": 1.422336459159851, "reward_std": 0.06646425276994705, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.42233631014823914, "rewards/correct_reward_func/std": 0.12761962413787842, "step": 192 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2072.0, "completions/mean_length": 1498.59521484375, "completions/mean_terminated_length": 1417.9517822265625, "completions/min_length": 822.0, "completions/min_terminated_length": 822.0, "epoch": 0.30062305295950154, "grad_norm": 0.5861801505088806, "kl": 0.036135466769337654, "learning_rate": 1.88625e-06, "loss": 0.0616, "num_tokens": 24468491.0, "reward": 1.4807928800582886, "reward_std": 0.07590245455503464, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.480792760848999, "rewards/correct_reward_func/std": 0.18916194140911102, "step": 193 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2399.0, "completions/max_terminated_length": 2399.0, "completions/mean_length": 1527.6785888671875, "completions/mean_terminated_length": 1527.6785888671875, "completions/min_length": 987.0, "completions/min_terminated_length": 987.0, "epoch": 0.30218068535825543, "grad_norm": 0.5360192656517029, "kl": 0.03715986758470535, "learning_rate": 1.885625e-06, "loss": 0.0182, "num_tokens": 24602846.0, "reward": 1.4494075775146484, "reward_std": 0.06413312256336212, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.44940757751464844, "rewards/correct_reward_func/std": 0.14744633436203003, "step": 194 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2146.0, "completions/max_terminated_length": 2146.0, "completions/mean_length": 1480.047607421875, "completions/mean_terminated_length": 1480.047607421875, "completions/min_length": 920.0, "completions/min_terminated_length": 920.0, "epoch": 0.3037383177570093, "grad_norm": 0.6489185094833374, "kl": 0.037878649309277534, "learning_rate": 1.885e-06, "loss": -0.0068, "num_tokens": 24733212.0, "reward": 1.4438209533691406, "reward_std": 0.05984492227435112, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.44382089376449585, "rewards/correct_reward_func/std": 0.11844155192375183, "step": 195 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2308.0, "completions/max_terminated_length": 2308.0, "completions/mean_length": 1502.5714111328125, "completions/mean_terminated_length": 1502.5714111328125, "completions/min_length": 806.0, "completions/min_terminated_length": 806.0, "epoch": 0.3052959501557632, "grad_norm": 0.6016018390655518, "kl": 0.037760429084300995, "learning_rate": 1.8843749999999999e-06, "loss": -0.0091, "num_tokens": 24865398.0, "reward": 1.4562833309173584, "reward_std": 0.0823647603392601, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.46818795800209045, "rewards/correct_reward_func/std": 0.15202713012695312, "step": 196 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2920.0, "completions/max_terminated_length": 2920.0, "completions/mean_length": 1469.3214111328125, "completions/mean_terminated_length": 1469.3214111328125, "completions/min_length": 846.0, "completions/min_terminated_length": 846.0, "epoch": 0.3068535825545171, "grad_norm": 0.5995698571205139, "kl": 0.03654175065457821, "learning_rate": 1.88375e-06, "loss": 0.027, "num_tokens": 24995049.0, "reward": 1.504838466644287, "reward_std": 0.07503201067447662, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5048382878303528, "rewards/correct_reward_func/std": 0.14492768049240112, "step": 197 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2287.0, "completions/max_terminated_length": 2287.0, "completions/mean_length": 1468.3214111328125, "completions/mean_terminated_length": 1468.3214111328125, "completions/min_length": 949.0, "completions/min_terminated_length": 949.0, "epoch": 0.308411214953271, "grad_norm": 0.5627298951148987, "kl": 0.03755324147641659, "learning_rate": 1.8831249999999999e-06, "loss": 0.0006, "num_tokens": 25124256.0, "reward": 1.4946554899215698, "reward_std": 0.07795637100934982, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4946552515029907, "rewards/correct_reward_func/std": 0.18591701984405518, "step": 198 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4446.0, "completions/max_terminated_length": 4446.0, "completions/mean_length": 1489.4761962890625, "completions/mean_terminated_length": 1489.4761962890625, "completions/min_length": 688.0, "completions/min_terminated_length": 688.0, "epoch": 0.3099688473520249, "grad_norm": 0.5919142365455627, "kl": 0.03832128271460533, "learning_rate": 1.8825e-06, "loss": 0.0226, "num_tokens": 25255240.0, "reward": 1.4337986707687378, "reward_std": 0.13976813852787018, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.457608163356781, "rewards/correct_reward_func/std": 0.16218747198581696, "step": 199 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2012.0, "completions/max_terminated_length": 2012.0, "completions/mean_length": 1443.702392578125, "completions/mean_terminated_length": 1443.702392578125, "completions/min_length": 1023.0, "completions/min_terminated_length": 1023.0, "epoch": 0.3115264797507788, "grad_norm": 0.6555087566375732, "kl": 0.0495732706040144, "learning_rate": 1.8818749999999999e-06, "loss": 0.0299, "num_tokens": 25382517.0, "reward": 1.4128016233444214, "reward_std": 0.0843491479754448, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4247063994407654, "rewards/correct_reward_func/std": 0.15398363769054413, "step": 200 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2807.0, "completions/max_terminated_length": 2807.0, "completions/mean_length": 1525.0714111328125, "completions/mean_terminated_length": 1525.0714111328125, "completions/min_length": 911.0, "completions/min_terminated_length": 911.0, "epoch": 0.3130841121495327, "grad_norm": 0.6016954779624939, "kl": 0.03820333816111088, "learning_rate": 1.88125e-06, "loss": 0.0094, "num_tokens": 25516815.0, "reward": 1.4183402061462402, "reward_std": 0.08215481042861938, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.4421497583389282, "rewards/correct_reward_func/std": 0.12978559732437134, "step": 201 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2805.0, "completions/max_terminated_length": 2805.0, "completions/mean_length": 1495.3095703125, "completions/mean_terminated_length": 1495.3095703125, "completions/min_length": 969.0, "completions/min_terminated_length": 969.0, "epoch": 0.3146417445482866, "grad_norm": 0.6088923811912537, "kl": 0.039613427594304085, "learning_rate": 1.8806249999999999e-06, "loss": 0.0236, "num_tokens": 25648445.0, "reward": 1.4182132482528687, "reward_std": 0.0829334408044815, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4301179349422455, "rewards/correct_reward_func/std": 0.14303255081176758, "step": 202 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2009.0, "completions/max_terminated_length": 2009.0, "completions/mean_length": 1409.511962890625, "completions/mean_terminated_length": 1409.511962890625, "completions/min_length": 963.0, "completions/min_terminated_length": 963.0, "epoch": 0.3161993769470405, "grad_norm": 0.6218343377113342, "kl": 0.03885827772319317, "learning_rate": 1.8799999999999998e-06, "loss": -0.0285, "num_tokens": 25772772.0, "reward": 1.5313540697097778, "reward_std": 0.06517814844846725, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5313540697097778, "rewards/correct_reward_func/std": 0.16049352288246155, "step": 203 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.023809523809523836, "completions/max_length": 8192.0, "completions/max_terminated_length": 2697.0, "completions/mean_length": 1644.90478515625, "completions/mean_terminated_length": 1485.219482421875, "completions/min_length": 888.0, "completions/min_terminated_length": 888.0, "epoch": 0.3177570093457944, "grad_norm": 0.608787477016449, "kl": 0.037238216027617455, "learning_rate": 1.879375e-06, "loss": 0.1147, "num_tokens": 25916914.0, "reward": 1.4204800128936768, "reward_std": 0.1299707591533661, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.4442894458770752, "rewards/correct_reward_func/std": 0.15315347909927368, "step": 204 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2141.0, "completions/max_terminated_length": 2141.0, "completions/mean_length": 1391.2857666015625, "completions/mean_terminated_length": 1391.2857666015625, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.31931464174454827, "grad_norm": 0.6118940114974976, "kl": 0.03947138041257858, "learning_rate": 1.8787499999999998e-06, "loss": -0.0174, "num_tokens": 26039794.0, "reward": 1.4847838878631592, "reward_std": 0.08492320775985718, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.48478370904922485, "rewards/correct_reward_func/std": 0.173141211271286, "step": 205 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2368.0, "completions/max_terminated_length": 2368.0, "completions/mean_length": 1503.46435546875, "completions/mean_terminated_length": 1503.46435546875, "completions/min_length": 1025.0, "completions/min_terminated_length": 1025.0, "epoch": 0.32087227414330216, "grad_norm": 0.5726230144500732, "kl": 0.03891279548406601, "learning_rate": 1.878125e-06, "loss": 0.0236, "num_tokens": 26172265.0, "reward": 1.4738762378692627, "reward_std": 0.06679094582796097, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4738762378692627, "rewards/correct_reward_func/std": 0.1379547268152237, "step": 206 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2086.0, "completions/max_terminated_length": 2086.0, "completions/mean_length": 1410.6785888671875, "completions/mean_terminated_length": 1410.6785888671875, "completions/min_length": 811.0, "completions/min_terminated_length": 811.0, "epoch": 0.32242990654205606, "grad_norm": 0.6023487448692322, "kl": 0.04066877439618111, "learning_rate": 1.8774999999999998e-06, "loss": -0.0013, "num_tokens": 26296666.0, "reward": 1.4893161058425903, "reward_std": 0.06180576980113983, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4893161356449127, "rewards/correct_reward_func/std": 0.13071846961975098, "step": 207 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5342.0, "completions/max_terminated_length": 5342.0, "completions/mean_length": 1561.0833740234375, "completions/mean_terminated_length": 1561.0833740234375, "completions/min_length": 847.0, "completions/min_terminated_length": 847.0, "epoch": 0.32398753894080995, "grad_norm": 0.5445609092712402, "kl": 0.03924229182302952, "learning_rate": 1.876875e-06, "loss": 0.0128, "num_tokens": 26433857.0, "reward": 1.4545553922653198, "reward_std": 0.047772545367479324, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4545552432537079, "rewards/correct_reward_func/std": 0.12593813240528107, "step": 208 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2229.0, "completions/mean_length": 1578.8690185546875, "completions/mean_terminated_length": 1499.1927490234375, "completions/min_length": 763.0, "completions/min_terminated_length": 763.0, "epoch": 0.32554517133956384, "grad_norm": 0.6344649791717529, "kl": 0.03905314393341541, "learning_rate": 1.8762499999999998e-06, "loss": 0.0585, "num_tokens": 26572404.0, "reward": 1.366521954536438, "reward_std": 0.1604805886745453, "rewards/contains_chinese/mean": 0.9642857313156128, "rewards/contains_chinese/std": 0.18669144809246063, "rewards/correct_reward_func/mean": 0.4022361636161804, "rewards/correct_reward_func/std": 0.13376381993293762, "step": 209 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2064.0, "completions/max_terminated_length": 2064.0, "completions/mean_length": 1402.7857666015625, "completions/mean_terminated_length": 1402.7857666015625, "completions/min_length": 766.0, "completions/min_terminated_length": 766.0, "epoch": 0.32710280373831774, "grad_norm": 0.633216381072998, "kl": 0.04154348373413086, "learning_rate": 1.875625e-06, "loss": -0.0066, "num_tokens": 26696082.0, "reward": 1.5711801052093506, "reward_std": 0.07012538611888885, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5711799263954163, "rewards/correct_reward_func/std": 0.11415733397006989, "step": 210 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2280.0, "completions/max_terminated_length": 2280.0, "completions/mean_length": 1505.1190185546875, "completions/mean_terminated_length": 1505.1190185546875, "completions/min_length": 958.0, "completions/min_terminated_length": 958.0, "epoch": 0.32866043613707163, "grad_norm": 0.5750628113746643, "kl": 0.040491702035069466, "learning_rate": 1.8749999999999998e-06, "loss": 0.0052, "num_tokens": 26828632.0, "reward": 1.4506397247314453, "reward_std": 0.07769308984279633, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.46254438161849976, "rewards/correct_reward_func/std": 0.13594752550125122, "step": 211 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2156.0, "completions/max_terminated_length": 2156.0, "completions/mean_length": 1535.9761962890625, "completions/mean_terminated_length": 1535.9761962890625, "completions/min_length": 1069.0, "completions/min_terminated_length": 1069.0, "epoch": 0.3302180685358255, "grad_norm": 0.5685467720031738, "kl": 0.04025058262050152, "learning_rate": 1.8743749999999997e-06, "loss": -0.0142, "num_tokens": 26963768.0, "reward": 1.5034323930740356, "reward_std": 0.06803149729967117, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5034322142601013, "rewards/correct_reward_func/std": 0.1639764904975891, "step": 212 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2916.0, "completions/max_terminated_length": 2916.0, "completions/mean_length": 1490.21435546875, "completions/mean_terminated_length": 1490.21435546875, "completions/min_length": 919.0, "completions/min_terminated_length": 919.0, "epoch": 0.3317757009345794, "grad_norm": 0.6390542387962341, "kl": 0.0426274798810482, "learning_rate": 1.8737499999999998e-06, "loss": -0.0129, "num_tokens": 27094820.0, "reward": 1.4857133626937866, "reward_std": 0.06765501946210861, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.48571324348449707, "rewards/correct_reward_func/std": 0.1724708527326584, "step": 213 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2195.0, "completions/max_terminated_length": 2195.0, "completions/mean_length": 1452.416748046875, "completions/mean_terminated_length": 1452.416748046875, "completions/min_length": 581.0, "completions/min_terminated_length": 581.0, "epoch": 0.3333333333333333, "grad_norm": 0.629348635673523, "kl": 0.04252097010612488, "learning_rate": 1.8731249999999997e-06, "loss": -0.0138, "num_tokens": 27222709.0, "reward": 1.4405913352966309, "reward_std": 0.05661017820239067, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.44059130549430847, "rewards/correct_reward_func/std": 0.10197056829929352, "step": 214 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2295.0, "completions/max_terminated_length": 2295.0, "completions/mean_length": 1461.34521484375, "completions/mean_terminated_length": 1461.34521484375, "completions/min_length": 956.0, "completions/min_terminated_length": 956.0, "epoch": 0.3348909657320872, "grad_norm": 0.5977749824523926, "kl": 0.04298440180718899, "learning_rate": 1.8725e-06, "loss": -0.0026, "num_tokens": 27351420.0, "reward": 1.505555272102356, "reward_std": 0.07077483087778091, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5055552124977112, "rewards/correct_reward_func/std": 0.19038523733615875, "step": 215 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2326.0, "completions/max_terminated_length": 2326.0, "completions/mean_length": 1527.8214111328125, "completions/mean_terminated_length": 1527.8214111328125, "completions/min_length": 815.0, "completions/min_terminated_length": 815.0, "epoch": 0.3364485981308411, "grad_norm": 0.5859149694442749, "kl": 0.043821416795253754, "learning_rate": 1.871875e-06, "loss": 0.0212, "num_tokens": 27485739.0, "reward": 1.4727665185928345, "reward_std": 0.13684409856796265, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.4965760409832001, "rewards/correct_reward_func/std": 0.15358960628509521, "step": 216 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2218.0, "completions/max_terminated_length": 2218.0, "completions/mean_length": 1495.8929443359375, "completions/mean_terminated_length": 1495.8929443359375, "completions/min_length": 838.0, "completions/min_terminated_length": 838.0, "epoch": 0.338006230529595, "grad_norm": 0.6052335500717163, "kl": 0.04412714019417763, "learning_rate": 1.87125e-06, "loss": 0.0149, "num_tokens": 27617418.0, "reward": 1.4542310237884521, "reward_std": 0.06858990341424942, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.454230934381485, "rewards/correct_reward_func/std": 0.16202780604362488, "step": 217 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2493.0, "completions/max_terminated_length": 2493.0, "completions/mean_length": 1571.96435546875, "completions/mean_terminated_length": 1571.96435546875, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.3395638629283489, "grad_norm": 0.5771605968475342, "kl": 0.043260419741272926, "learning_rate": 1.870625e-06, "loss": -0.0301, "num_tokens": 27755499.0, "reward": 1.4477570056915283, "reward_std": 0.08448237925767899, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.44775694608688354, "rewards/correct_reward_func/std": 0.15343379974365234, "step": 218 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2162.0, "completions/max_terminated_length": 2162.0, "completions/mean_length": 1507.511962890625, "completions/mean_terminated_length": 1507.511962890625, "completions/min_length": 883.0, "completions/min_terminated_length": 883.0, "epoch": 0.3411214953271028, "grad_norm": 0.5883904099464417, "kl": 0.044709596782922745, "learning_rate": 1.87e-06, "loss": -0.0025, "num_tokens": 27888064.0, "reward": 1.4478862285614014, "reward_std": 0.07146090269088745, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.447886198759079, "rewards/correct_reward_func/std": 0.12358597666025162, "step": 219 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2433.0, "completions/max_terminated_length": 2433.0, "completions/mean_length": 1559.8214111328125, "completions/mean_terminated_length": 1559.8214111328125, "completions/min_length": 1039.0, "completions/min_terminated_length": 1039.0, "epoch": 0.3426791277258567, "grad_norm": 0.5822417140007019, "kl": 0.04406227543950081, "learning_rate": 1.869375e-06, "loss": -0.0193, "num_tokens": 28025029.0, "reward": 1.5140615701675415, "reward_std": 0.10227732360363007, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5259661674499512, "rewards/correct_reward_func/std": 0.19070927798748016, "step": 220 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3088.0, "completions/max_terminated_length": 3088.0, "completions/mean_length": 1581.6309814453125, "completions/mean_terminated_length": 1581.6309814453125, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "epoch": 0.3442367601246106, "grad_norm": 0.6107174754142761, "kl": 0.04346361383795738, "learning_rate": 1.86875e-06, "loss": -0.0145, "num_tokens": 28163856.0, "reward": 1.5085567235946655, "reward_std": 0.08370744436979294, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5085568428039551, "rewards/correct_reward_func/std": 0.14021635055541992, "step": 221 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2224.0, "completions/max_terminated_length": 2224.0, "completions/mean_length": 1555.7261962890625, "completions/mean_terminated_length": 1555.7261962890625, "completions/min_length": 855.0, "completions/min_terminated_length": 855.0, "epoch": 0.34579439252336447, "grad_norm": 0.5877107977867126, "kl": 0.043224770575761795, "learning_rate": 1.868125e-06, "loss": -0.0078, "num_tokens": 28300597.0, "reward": 1.4819693565368652, "reward_std": 0.09113749116659164, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.49387404322624207, "rewards/correct_reward_func/std": 0.19690191745758057, "step": 222 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2318.0, "completions/max_terminated_length": 2318.0, "completions/mean_length": 1568.0714111328125, "completions/mean_terminated_length": 1568.0714111328125, "completions/min_length": 577.0, "completions/min_terminated_length": 577.0, "epoch": 0.34735202492211836, "grad_norm": 0.5657203197479248, "kl": 0.04514329880475998, "learning_rate": 1.8675e-06, "loss": -0.0005, "num_tokens": 28438135.0, "reward": 1.417351484298706, "reward_std": 0.08495763689279556, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4173515737056732, "rewards/correct_reward_func/std": 0.13209807872772217, "step": 223 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3149.0, "completions/max_terminated_length": 3149.0, "completions/mean_length": 1625.6785888671875, "completions/mean_terminated_length": 1625.6785888671875, "completions/min_length": 898.0, "completions/min_terminated_length": 898.0, "epoch": 0.34890965732087226, "grad_norm": 0.6052978038787842, "kl": 0.04441903904080391, "learning_rate": 1.866875e-06, "loss": -0.0046, "num_tokens": 28580818.0, "reward": 1.4290771484375, "reward_std": 0.10442011803388596, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.44098177552223206, "rewards/correct_reward_func/std": 0.1553770899772644, "step": 224 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2469.0, "completions/max_terminated_length": 2469.0, "completions/mean_length": 1623.761962890625, "completions/mean_terminated_length": 1623.761962890625, "completions/min_length": 875.0, "completions/min_terminated_length": 875.0, "epoch": 0.35046728971962615, "grad_norm": 0.5626906156539917, "kl": 0.04204758256673813, "learning_rate": 1.86625e-06, "loss": 0.0061, "num_tokens": 28723454.0, "reward": 1.496443510055542, "reward_std": 0.06441661715507507, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.49644333124160767, "rewards/correct_reward_func/std": 0.1379326730966568, "step": 225 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2326.0, "completions/mean_length": 1645.666748046875, "completions/mean_terminated_length": 1566.795166015625, "completions/min_length": 1062.0, "completions/min_terminated_length": 1062.0, "epoch": 0.35202492211838005, "grad_norm": 0.568545937538147, "kl": 0.041140057146549225, "learning_rate": 1.865625e-06, "loss": 0.0597, "num_tokens": 28867588.0, "reward": 1.450238585472107, "reward_std": 0.105202816426754, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4621433615684509, "rewards/correct_reward_func/std": 0.13623471558094025, "step": 226 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2350.0, "completions/max_terminated_length": 2350.0, "completions/mean_length": 1559.2261962890625, "completions/mean_terminated_length": 1559.2261962890625, "completions/min_length": 796.0, "completions/min_terminated_length": 796.0, "epoch": 0.35358255451713394, "grad_norm": 0.6394890546798706, "kl": 0.04375514201819897, "learning_rate": 1.865e-06, "loss": -0.0027, "num_tokens": 29004347.0, "reward": 1.4285489320755005, "reward_std": 0.04757591709494591, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.42854899168014526, "rewards/correct_reward_func/std": 0.12418505549430847, "step": 227 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2343.0, "completions/max_terminated_length": 2343.0, "completions/mean_length": 1529.46435546875, "completions/mean_terminated_length": 1529.46435546875, "completions/min_length": 298.0, "completions/min_terminated_length": 298.0, "epoch": 0.35514018691588783, "grad_norm": 0.6031341552734375, "kl": 0.04356654919683933, "learning_rate": 1.8643749999999998e-06, "loss": -0.0296, "num_tokens": 29138618.0, "reward": 1.5230813026428223, "reward_std": 0.09695133566856384, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5230813026428223, "rewards/correct_reward_func/std": 0.19408383965492249, "step": 228 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2603.0, "completions/max_terminated_length": 2603.0, "completions/mean_length": 1547.21435546875, "completions/mean_terminated_length": 1547.21435546875, "completions/min_length": 737.0, "completions/min_terminated_length": 737.0, "epoch": 0.35669781931464173, "grad_norm": 0.5816037654876709, "kl": 0.04396030865609646, "learning_rate": 1.86375e-06, "loss": -0.0083, "num_tokens": 29274332.0, "reward": 1.4719088077545166, "reward_std": 0.07453076541423798, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4719087481498718, "rewards/correct_reward_func/std": 0.17895452678203583, "step": 229 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6889.0, "completions/max_terminated_length": 6889.0, "completions/mean_length": 1678.75, "completions/mean_terminated_length": 1678.75, "completions/min_length": 482.0, "completions/min_terminated_length": 482.0, "epoch": 0.3582554517133956, "grad_norm": 0.5625508427619934, "kl": 0.04273660108447075, "learning_rate": 1.8631249999999998e-06, "loss": -0.0144, "num_tokens": 29421227.0, "reward": 1.4578830003738403, "reward_std": 0.06554538756608963, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.45788294076919556, "rewards/correct_reward_func/std": 0.19993598759174347, "step": 230 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2497.0, "completions/mean_length": 1663.2857666015625, "completions/mean_terminated_length": 1584.62646484375, "completions/min_length": 1098.0, "completions/min_terminated_length": 1098.0, "epoch": 0.3598130841121495, "grad_norm": 0.6602963209152222, "kl": 0.042186228558421135, "learning_rate": 1.8625e-06, "loss": 0.0662, "num_tokens": 29566793.0, "reward": 1.509660005569458, "reward_std": 0.08291852474212646, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5096598267555237, "rewards/correct_reward_func/std": 0.140364408493042, "step": 231 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2407.0, "completions/max_terminated_length": 2407.0, "completions/mean_length": 1553.5, "completions/mean_terminated_length": 1553.5, "completions/min_length": 869.0, "completions/min_terminated_length": 869.0, "epoch": 0.3613707165109034, "grad_norm": 0.5636409521102905, "kl": 0.0421723909676075, "learning_rate": 1.8618749999999999e-06, "loss": 0.0171, "num_tokens": 29703263.0, "reward": 1.4572166204452515, "reward_std": 0.08119600266218185, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4691213071346283, "rewards/correct_reward_func/std": 0.16932806372642517, "step": 232 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2367.0, "completions/max_terminated_length": 2367.0, "completions/mean_length": 1605.0833740234375, "completions/mean_terminated_length": 1605.0833740234375, "completions/min_length": 967.0, "completions/min_terminated_length": 967.0, "epoch": 0.3629283489096573, "grad_norm": 0.5694032907485962, "kl": 0.042043108493089676, "learning_rate": 1.86125e-06, "loss": 0.0077, "num_tokens": 29844144.0, "reward": 1.4451359510421753, "reward_std": 0.05986570194363594, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.44513583183288574, "rewards/correct_reward_func/std": 0.10634031891822815, "step": 233 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2301.0, "completions/max_terminated_length": 2301.0, "completions/mean_length": 1589.84521484375, "completions/mean_terminated_length": 1589.84521484375, "completions/min_length": 978.0, "completions/min_terminated_length": 978.0, "epoch": 0.3644859813084112, "grad_norm": 0.5871843695640564, "kl": 0.04350174590945244, "learning_rate": 1.8606249999999999e-06, "loss": 0.0386, "num_tokens": 29983685.0, "reward": 1.4710910320281982, "reward_std": 0.08404207974672318, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.47109100222587585, "rewards/correct_reward_func/std": 0.17535626888275146, "step": 234 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2334.0, "completions/max_terminated_length": 2334.0, "completions/mean_length": 1567.0357666015625, "completions/mean_terminated_length": 1567.0357666015625, "completions/min_length": 951.0, "completions/min_terminated_length": 951.0, "epoch": 0.3660436137071651, "grad_norm": 0.5786914825439453, "kl": 0.03941281884908676, "learning_rate": 1.86e-06, "loss": 0.0094, "num_tokens": 30121118.0, "reward": 1.4925031661987305, "reward_std": 0.10374214500188828, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5044078826904297, "rewards/correct_reward_func/std": 0.15605290234088898, "step": 235 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2486.0, "completions/max_terminated_length": 2486.0, "completions/mean_length": 1611.0238037109375, "completions/mean_terminated_length": 1611.0238037109375, "completions/min_length": 894.0, "completions/min_terminated_length": 894.0, "epoch": 0.367601246105919, "grad_norm": 0.5391596555709839, "kl": 0.0409199483692646, "learning_rate": 1.8593749999999999e-06, "loss": 0.0334, "num_tokens": 30262456.0, "reward": 1.4530733823776245, "reward_std": 0.06101413816213608, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4530732333660126, "rewards/correct_reward_func/std": 0.17970231175422668, "step": 236 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2376.0, "completions/max_terminated_length": 2376.0, "completions/mean_length": 1591.416748046875, "completions/mean_terminated_length": 1591.416748046875, "completions/min_length": 1109.0, "completions/min_terminated_length": 1109.0, "epoch": 0.3691588785046729, "grad_norm": 0.595643937587738, "kl": 0.04340810887515545, "learning_rate": 1.8587499999999998e-06, "loss": 0.0072, "num_tokens": 30402333.0, "reward": 1.46660578250885, "reward_std": 0.06254373490810394, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.46660569310188293, "rewards/correct_reward_func/std": 0.12236826121807098, "step": 237 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2291.0, "completions/max_terminated_length": 2291.0, "completions/mean_length": 1531.9881591796875, "completions/mean_terminated_length": 1531.9881591796875, "completions/min_length": 879.0, "completions/min_terminated_length": 879.0, "epoch": 0.3707165109034268, "grad_norm": 0.5581408143043518, "kl": 0.04236830584704876, "learning_rate": 1.8581249999999999e-06, "loss": 0.0098, "num_tokens": 30537140.0, "reward": 1.5432277917861938, "reward_std": 0.0626709833741188, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5432276129722595, "rewards/correct_reward_func/std": 0.14388269186019897, "step": 238 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2228.0, "completions/max_terminated_length": 2228.0, "completions/mean_length": 1550.797607421875, "completions/mean_terminated_length": 1550.797607421875, "completions/min_length": 805.0, "completions/min_terminated_length": 805.0, "epoch": 0.37227414330218067, "grad_norm": 0.574525773525238, "kl": 0.04116277024149895, "learning_rate": 1.8574999999999998e-06, "loss": -0.0023, "num_tokens": 30673653.0, "reward": 1.4574679136276245, "reward_std": 0.06034516915678978, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4574679434299469, "rewards/correct_reward_func/std": 0.12509454786777496, "step": 239 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2059.0, "completions/max_terminated_length": 2059.0, "completions/mean_length": 1503.8929443359375, "completions/mean_terminated_length": 1503.8929443359375, "completions/min_length": 1007.0, "completions/min_terminated_length": 1007.0, "epoch": 0.37383177570093457, "grad_norm": 0.5814647674560547, "kl": 0.04194348491728306, "learning_rate": 1.856875e-06, "loss": 0.0009, "num_tokens": 30806058.0, "reward": 1.4536018371582031, "reward_std": 0.0663735568523407, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.45360174775123596, "rewards/correct_reward_func/std": 0.11585589498281479, "step": 240 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2368.0, "completions/max_terminated_length": 2368.0, "completions/mean_length": 1468.6785888671875, "completions/mean_terminated_length": 1468.6785888671875, "completions/min_length": 866.0, "completions/min_terminated_length": 866.0, "epoch": 0.37538940809968846, "grad_norm": 0.5900517702102661, "kl": 0.041691072285175323, "learning_rate": 1.8562499999999998e-06, "loss": 0.0052, "num_tokens": 30935385.0, "reward": 1.4337111711502075, "reward_std": 0.0778563842177391, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4456157684326172, "rewards/correct_reward_func/std": 0.13722439110279083, "step": 241 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2428.0, "completions/max_terminated_length": 2428.0, "completions/mean_length": 1410.9405517578125, "completions/mean_terminated_length": 1410.9405517578125, "completions/min_length": 715.0, "completions/min_terminated_length": 715.0, "epoch": 0.37694704049844235, "grad_norm": 0.617452085018158, "kl": 0.04172271862626076, "learning_rate": 1.855625e-06, "loss": 0.0126, "num_tokens": 31059784.0, "reward": 1.4357478618621826, "reward_std": 0.08216311782598495, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.44765254855155945, "rewards/correct_reward_func/std": 0.13791020214557648, "step": 242 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2206.0, "completions/max_terminated_length": 2206.0, "completions/mean_length": 1451.5833740234375, "completions/mean_terminated_length": 1451.5833740234375, "completions/min_length": 883.0, "completions/min_terminated_length": 883.0, "epoch": 0.37850467289719625, "grad_norm": 0.5894716382026672, "kl": 0.040936123579740524, "learning_rate": 1.8549999999999998e-06, "loss": 0.0101, "num_tokens": 31187819.0, "reward": 1.533345341682434, "reward_std": 0.07711285352706909, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5333453416824341, "rewards/correct_reward_func/std": 0.12535390257835388, "step": 243 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2403.0, "completions/max_terminated_length": 2403.0, "completions/mean_length": 1480.15478515625, "completions/mean_terminated_length": 1480.15478515625, "completions/min_length": 954.0, "completions/min_terminated_length": 954.0, "epoch": 0.38006230529595014, "grad_norm": 0.613828182220459, "kl": 0.043313439935445786, "learning_rate": 1.854375e-06, "loss": -0.0173, "num_tokens": 31318278.0, "reward": 1.4279972314834595, "reward_std": 0.06592278927564621, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4279972314834595, "rewards/correct_reward_func/std": 0.1297278255224228, "step": 244 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2105.0, "completions/max_terminated_length": 2105.0, "completions/mean_length": 1363.1905517578125, "completions/mean_terminated_length": 1363.1905517578125, "completions/min_length": 663.0, "completions/min_terminated_length": 663.0, "epoch": 0.38161993769470404, "grad_norm": 0.6236963272094727, "kl": 0.04276050627231598, "learning_rate": 1.8537499999999998e-06, "loss": 0.0313, "num_tokens": 31438636.0, "reward": 1.4753836393356323, "reward_std": 0.09151271730661392, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.48728832602500916, "rewards/correct_reward_func/std": 0.1840963512659073, "step": 245 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2239.0, "completions/mean_length": 1526.7857666015625, "completions/mean_terminated_length": 1446.48193359375, "completions/min_length": 631.0, "completions/min_terminated_length": 631.0, "epoch": 0.38317757009345793, "grad_norm": 0.5810291171073914, "kl": 0.04004097357392311, "learning_rate": 1.8531249999999997e-06, "loss": 0.0546, "num_tokens": 31573048.0, "reward": 1.4642269611358643, "reward_std": 0.07142822444438934, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.46422696113586426, "rewards/correct_reward_func/std": 0.13529928028583527, "step": 246 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2306.0, "completions/max_terminated_length": 2306.0, "completions/mean_length": 1418.5595703125, "completions/mean_terminated_length": 1418.5595703125, "completions/min_length": 877.0, "completions/min_terminated_length": 877.0, "epoch": 0.3847352024922118, "grad_norm": 0.594132125377655, "kl": 0.04153658635914326, "learning_rate": 1.8525e-06, "loss": 0.0037, "num_tokens": 31698363.0, "reward": 1.4876208305358887, "reward_std": 0.05413410812616348, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4876207709312439, "rewards/correct_reward_func/std": 0.134954035282135, "step": 247 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2087.0, "completions/max_terminated_length": 2087.0, "completions/mean_length": 1418.4285888671875, "completions/mean_terminated_length": 1418.4285888671875, "completions/min_length": 879.0, "completions/min_terminated_length": 879.0, "epoch": 0.3862928348909657, "grad_norm": 0.6255624890327454, "kl": 0.04057171009480953, "learning_rate": 1.851875e-06, "loss": 0.0215, "num_tokens": 31823583.0, "reward": 1.5005978345870972, "reward_std": 0.07266637682914734, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5125024914741516, "rewards/correct_reward_func/std": 0.1100957989692688, "step": 248 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2308.0, "completions/max_terminated_length": 2308.0, "completions/mean_length": 1451.107177734375, "completions/mean_terminated_length": 1451.107177734375, "completions/min_length": 932.0, "completions/min_terminated_length": 932.0, "epoch": 0.3878504672897196, "grad_norm": 0.5847201943397522, "kl": 0.04207037389278412, "learning_rate": 1.85125e-06, "loss": 0.0091, "num_tokens": 31951524.0, "reward": 1.4142539501190186, "reward_std": 0.05806390568614006, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.41425377130508423, "rewards/correct_reward_func/std": 0.11380590498447418, "step": 249 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2403.0, "completions/max_terminated_length": 2403.0, "completions/mean_length": 1413.1905517578125, "completions/mean_terminated_length": 1413.1905517578125, "completions/min_length": 594.0, "completions/min_terminated_length": 594.0, "epoch": 0.3894080996884735, "grad_norm": 0.6250362396240234, "kl": 0.04115402512252331, "learning_rate": 1.850625e-06, "loss": -0.0033, "num_tokens": 32076256.0, "reward": 1.4924920797348022, "reward_std": 0.08884865790605545, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5043967962265015, "rewards/correct_reward_func/std": 0.13221827149391174, "step": 250 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2204.0, "completions/max_terminated_length": 2204.0, "completions/mean_length": 1406.2261962890625, "completions/mean_terminated_length": 1406.2261962890625, "completions/min_length": 703.0, "completions/min_terminated_length": 703.0, "epoch": 0.3909657320872274, "grad_norm": 0.6414059996604919, "kl": 0.0399419330060482, "learning_rate": 1.85e-06, "loss": 0.0096, "num_tokens": 32200295.0, "reward": 1.4972379207611084, "reward_std": 0.07513421773910522, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4972379803657532, "rewards/correct_reward_func/std": 0.14758117496967316, "step": 251 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3084.0, "completions/max_terminated_length": 3084.0, "completions/mean_length": 1468.4285888671875, "completions/mean_terminated_length": 1468.4285888671875, "completions/min_length": 940.0, "completions/min_terminated_length": 940.0, "epoch": 0.3925233644859813, "grad_norm": 0.5872684717178345, "kl": 0.04071245715022087, "learning_rate": 1.849375e-06, "loss": -0.0157, "num_tokens": 32329739.0, "reward": 1.4793757200241089, "reward_std": 0.07921571284532547, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4793757498264313, "rewards/correct_reward_func/std": 0.14062678813934326, "step": 252 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1970.0, "completions/max_terminated_length": 1970.0, "completions/mean_length": 1309.952392578125, "completions/mean_terminated_length": 1309.952392578125, "completions/min_length": 727.0, "completions/min_terminated_length": 727.0, "epoch": 0.3940809968847352, "grad_norm": 0.6398329138755798, "kl": 0.042467374354600906, "learning_rate": 1.8487499999999999e-06, "loss": -0.001, "num_tokens": 32445493.0, "reward": 1.4916237592697144, "reward_std": 0.10208263248205185, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5035284757614136, "rewards/correct_reward_func/std": 0.2051754891872406, "step": 253 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2700.0, "completions/mean_length": 1518.261962890625, "completions/mean_terminated_length": 1437.8553466796875, "completions/min_length": 758.0, "completions/min_terminated_length": 758.0, "epoch": 0.3956386292834891, "grad_norm": 0.6442155838012695, "kl": 0.04048139229416847, "learning_rate": 1.848125e-06, "loss": 0.0944, "num_tokens": 32579225.0, "reward": 1.4657591581344604, "reward_std": 0.10077626258134842, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4776638448238373, "rewards/correct_reward_func/std": 0.15037497878074646, "step": 254 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2150.0, "completions/max_terminated_length": 2150.0, "completions/mean_length": 1396.46435546875, "completions/mean_terminated_length": 1396.46435546875, "completions/min_length": 842.0, "completions/min_terminated_length": 842.0, "epoch": 0.397196261682243, "grad_norm": 0.5844140648841858, "kl": 0.03995479829609394, "learning_rate": 1.8474999999999999e-06, "loss": -0.0318, "num_tokens": 32702402.0, "reward": 1.4533027410507202, "reward_std": 0.11024706810712814, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4652075171470642, "rewards/correct_reward_func/std": 0.15434937179088593, "step": 255 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2225.0, "completions/max_terminated_length": 2225.0, "completions/mean_length": 1392.0357666015625, "completions/mean_terminated_length": 1392.0357666015625, "completions/min_length": 937.0, "completions/min_terminated_length": 937.0, "epoch": 0.3987538940809969, "grad_norm": 0.6156937479972839, "kl": 0.04135890118777752, "learning_rate": 1.846875e-06, "loss": -0.0274, "num_tokens": 32825423.0, "reward": 1.4397926330566406, "reward_std": 0.13189160823822021, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.46360212564468384, "rewards/correct_reward_func/std": 0.11824122816324234, "step": 256 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2059.0, "completions/max_terminated_length": 2059.0, "completions/mean_length": 1323.09521484375, "completions/mean_terminated_length": 1323.09521484375, "completions/min_length": 626.0, "completions/min_terminated_length": 626.0, "epoch": 0.40031152647975077, "grad_norm": 0.6653797626495361, "kl": 0.041794365271925926, "learning_rate": 1.84625e-06, "loss": 0.0155, "num_tokens": 32942683.0, "reward": 1.4287505149841309, "reward_std": 0.10937704890966415, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.45256009697914124, "rewards/correct_reward_func/std": 0.1507876217365265, "step": 257 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2162.0, "completions/max_terminated_length": 2162.0, "completions/mean_length": 1432.6905517578125, "completions/mean_terminated_length": 1432.6905517578125, "completions/min_length": 835.0, "completions/min_terminated_length": 835.0, "epoch": 0.40186915887850466, "grad_norm": 0.5770981311798096, "kl": 0.04339606128633022, "learning_rate": 1.845625e-06, "loss": -0.0066, "num_tokens": 33068987.0, "reward": 1.435407280921936, "reward_std": 0.10868566483259201, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.459216833114624, "rewards/correct_reward_func/std": 0.13378530740737915, "step": 258 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2165.0, "completions/max_terminated_length": 2165.0, "completions/mean_length": 1342.0, "completions/mean_terminated_length": 1342.0, "completions/min_length": 782.0, "completions/min_terminated_length": 782.0, "epoch": 0.40342679127725856, "grad_norm": 0.5935827493667603, "kl": 0.0418770182877779, "learning_rate": 1.845e-06, "loss": -0.0092, "num_tokens": 33187595.0, "reward": 1.4948703050613403, "reward_std": 0.06175254285335541, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4948703348636627, "rewards/correct_reward_func/std": 0.14253027737140656, "step": 259 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2052.0, "completions/max_terminated_length": 2052.0, "completions/mean_length": 1342.392822265625, "completions/mean_terminated_length": 1342.392822265625, "completions/min_length": 889.0, "completions/min_terminated_length": 889.0, "epoch": 0.40498442367601245, "grad_norm": 0.6045507192611694, "kl": 0.04008128307759762, "learning_rate": 1.844375e-06, "loss": 0.017, "num_tokens": 33306416.0, "reward": 1.4627269506454468, "reward_std": 0.11455470323562622, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.48653644323349, "rewards/correct_reward_func/std": 0.13522200286388397, "step": 260 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2129.0, "completions/max_terminated_length": 2129.0, "completions/mean_length": 1361.892822265625, "completions/mean_terminated_length": 1361.892822265625, "completions/min_length": 796.0, "completions/min_terminated_length": 796.0, "epoch": 0.40654205607476634, "grad_norm": 0.6531806588172913, "kl": 0.043158069252967834, "learning_rate": 1.84375e-06, "loss": -0.0002, "num_tokens": 33426683.0, "reward": 1.4994860887527466, "reward_std": 0.056075319647789, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4994860291481018, "rewards/correct_reward_func/std": 0.13296645879745483, "step": 261 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2196.0, "completions/max_terminated_length": 2196.0, "completions/mean_length": 1350.40478515625, "completions/mean_terminated_length": 1350.40478515625, "completions/min_length": 814.0, "completions/min_terminated_length": 814.0, "epoch": 0.40809968847352024, "grad_norm": 0.6397004723548889, "kl": 0.042096974328160286, "learning_rate": 1.8431249999999998e-06, "loss": 0.0194, "num_tokens": 33546147.0, "reward": 1.5182113647460938, "reward_std": 0.09466809034347534, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.5420209169387817, "rewards/correct_reward_func/std": 0.1314767450094223, "step": 262 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2366.0, "completions/max_terminated_length": 2366.0, "completions/mean_length": 1378.107177734375, "completions/mean_terminated_length": 1378.107177734375, "completions/min_length": 854.0, "completions/min_terminated_length": 854.0, "epoch": 0.40965732087227413, "grad_norm": 0.6153465509414673, "kl": 0.04178653843700886, "learning_rate": 1.8425e-06, "loss": 0.0061, "num_tokens": 33667932.0, "reward": 1.5274070501327515, "reward_std": 0.06379646062850952, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5274069309234619, "rewards/correct_reward_func/std": 0.11842171102762222, "step": 263 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2146.0, "completions/max_terminated_length": 2146.0, "completions/mean_length": 1360.357177734375, "completions/mean_terminated_length": 1360.357177734375, "completions/min_length": 910.0, "completions/min_terminated_length": 910.0, "epoch": 0.411214953271028, "grad_norm": 0.6154939532279968, "kl": 0.04235922172665596, "learning_rate": 1.8418749999999998e-06, "loss": 0.0078, "num_tokens": 33788220.0, "reward": 1.4856061935424805, "reward_std": 0.09970905631780624, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.5094156861305237, "rewards/correct_reward_func/std": 0.11878927052021027, "step": 264 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2242.0, "completions/max_terminated_length": 2242.0, "completions/mean_length": 1453.40478515625, "completions/mean_terminated_length": 1453.40478515625, "completions/min_length": 877.0, "completions/min_terminated_length": 877.0, "epoch": 0.4127725856697819, "grad_norm": 0.5895340442657471, "kl": 0.041117291897535324, "learning_rate": 1.84125e-06, "loss": 0.0121, "num_tokens": 33916336.0, "reward": 1.5335173606872559, "reward_std": 0.06757655739784241, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5335173010826111, "rewards/correct_reward_func/std": 0.15610700845718384, "step": 265 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2147.0, "completions/max_terminated_length": 2147.0, "completions/mean_length": 1328.607177734375, "completions/mean_terminated_length": 1328.607177734375, "completions/min_length": 617.0, "completions/min_terminated_length": 617.0, "epoch": 0.4143302180685358, "grad_norm": 0.608429491519928, "kl": 0.04289627820253372, "learning_rate": 1.8406249999999998e-06, "loss": 0.0158, "num_tokens": 34033747.0, "reward": 1.4430692195892334, "reward_std": 0.05843156576156616, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.443069189786911, "rewards/correct_reward_func/std": 0.1901571899652481, "step": 266 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2294.0, "completions/max_terminated_length": 2294.0, "completions/mean_length": 1355.8809814453125, "completions/mean_terminated_length": 1355.8809814453125, "completions/min_length": 874.0, "completions/min_terminated_length": 874.0, "epoch": 0.4158878504672897, "grad_norm": 0.6520563960075378, "kl": 0.04428970441222191, "learning_rate": 1.84e-06, "loss": 0.0026, "num_tokens": 34153503.0, "reward": 1.441379427909851, "reward_std": 0.11828587204217911, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.4651888906955719, "rewards/correct_reward_func/std": 0.1112525463104248, "step": 267 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2317.0, "completions/max_terminated_length": 2317.0, "completions/mean_length": 1398.202392578125, "completions/mean_terminated_length": 1398.202392578125, "completions/min_length": 672.0, "completions/min_terminated_length": 672.0, "epoch": 0.4174454828660436, "grad_norm": 0.6122962236404419, "kl": 0.043515296652913094, "learning_rate": 1.8393749999999999e-06, "loss": -0.0184, "num_tokens": 34277012.0, "reward": 1.4698716402053833, "reward_std": 0.10509771853685379, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.4936811327934265, "rewards/correct_reward_func/std": 0.1476019024848938, "step": 268 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3903.0, "completions/max_terminated_length": 3903.0, "completions/mean_length": 1497.8929443359375, "completions/mean_terminated_length": 1497.8929443359375, "completions/min_length": 786.0, "completions/min_terminated_length": 786.0, "epoch": 0.4190031152647975, "grad_norm": 0.5924271941184998, "kl": 0.04205969721078873, "learning_rate": 1.83875e-06, "loss": 0.0098, "num_tokens": 34408937.0, "reward": 1.509368896484375, "reward_std": 0.09606263041496277, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5212737321853638, "rewards/correct_reward_func/std": 0.1800881028175354, "step": 269 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1868.0, "completions/max_terminated_length": 1868.0, "completions/mean_length": 1369.9285888671875, "completions/mean_terminated_length": 1369.9285888671875, "completions/min_length": 831.0, "completions/min_terminated_length": 831.0, "epoch": 0.4205607476635514, "grad_norm": 0.6304810047149658, "kl": 0.04325720854103565, "learning_rate": 1.8381249999999999e-06, "loss": 0.0025, "num_tokens": 34530017.0, "reward": 1.5110054016113281, "reward_std": 0.0600060299038887, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5110054016113281, "rewards/correct_reward_func/std": 0.1884339600801468, "step": 270 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2724.0, "completions/max_terminated_length": 2724.0, "completions/mean_length": 1406.3214111328125, "completions/mean_terminated_length": 1406.3214111328125, "completions/min_length": 413.0, "completions/min_terminated_length": 413.0, "epoch": 0.4221183800623053, "grad_norm": 0.6104097366333008, "kl": 0.041636811569333076, "learning_rate": 1.8374999999999998e-06, "loss": -0.0099, "num_tokens": 34654028.0, "reward": 1.5112788677215576, "reward_std": 0.07226122170686722, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5112787485122681, "rewards/correct_reward_func/std": 0.16975651681423187, "step": 271 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2353.0, "completions/max_terminated_length": 2353.0, "completions/mean_length": 1444.3690185546875, "completions/mean_terminated_length": 1444.3690185546875, "completions/min_length": 753.0, "completions/min_terminated_length": 753.0, "epoch": 0.4236760124610592, "grad_norm": 0.6154975295066833, "kl": 0.045218756422400475, "learning_rate": 1.8368749999999999e-06, "loss": 0.012, "num_tokens": 34781289.0, "reward": 1.570892333984375, "reward_std": 0.06964144110679626, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.570892333984375, "rewards/correct_reward_func/std": 0.17510028183460236, "step": 272 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2181.0, "completions/max_terminated_length": 2181.0, "completions/mean_length": 1431.8333740234375, "completions/mean_terminated_length": 1431.8333740234375, "completions/min_length": 311.0, "completions/min_terminated_length": 311.0, "epoch": 0.4252336448598131, "grad_norm": 0.6236024498939514, "kl": 0.04151295870542526, "learning_rate": 1.8362499999999998e-06, "loss": -0.022, "num_tokens": 34907617.0, "reward": 1.4521212577819824, "reward_std": 0.12464414536952972, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.47593072056770325, "rewards/correct_reward_func/std": 0.20174725353717804, "step": 273 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2056.0, "completions/max_terminated_length": 2056.0, "completions/mean_length": 1358.96435546875, "completions/mean_terminated_length": 1358.96435546875, "completions/min_length": 626.0, "completions/min_terminated_length": 626.0, "epoch": 0.42679127725856697, "grad_norm": 0.6278449892997742, "kl": 0.04468250274658203, "learning_rate": 1.835625e-06, "loss": -0.0027, "num_tokens": 35027644.0, "reward": 1.4580986499786377, "reward_std": 0.10002487152814865, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4700033664703369, "rewards/correct_reward_func/std": 0.13392986357212067, "step": 274 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2506.0, "completions/max_terminated_length": 2506.0, "completions/mean_length": 1500.6190185546875, "completions/mean_terminated_length": 1500.6190185546875, "completions/min_length": 842.0, "completions/min_terminated_length": 842.0, "epoch": 0.42834890965732086, "grad_norm": 0.5943901538848877, "kl": 0.04206756874918938, "learning_rate": 1.8349999999999998e-06, "loss": 0.0157, "num_tokens": 35159816.0, "reward": 1.392418622970581, "reward_std": 0.08610358834266663, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4043233394622803, "rewards/correct_reward_func/std": 0.11100338399410248, "step": 275 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2434.0, "completions/max_terminated_length": 2434.0, "completions/mean_length": 1480.7261962890625, "completions/mean_terminated_length": 1480.7261962890625, "completions/min_length": 901.0, "completions/min_terminated_length": 901.0, "epoch": 0.42990654205607476, "grad_norm": 0.5723066329956055, "kl": 0.04631072096526623, "learning_rate": 1.834375e-06, "loss": 0.0086, "num_tokens": 35289993.0, "reward": 1.4739741086959839, "reward_std": 0.09586656838655472, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4858788549900055, "rewards/correct_reward_func/std": 0.15505263209342957, "step": 276 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0357142857142857, "completions/max_length": 8192.0, "completions/max_terminated_length": 3679.0, "completions/mean_length": 1646.666748046875, "completions/mean_terminated_length": 1404.2469482421875, "completions/min_length": 691.0, "completions/min_terminated_length": 691.0, "epoch": 0.43146417445482865, "grad_norm": 0.543669581413269, "kl": 0.03733106330037117, "learning_rate": 1.8337499999999998e-06, "loss": 0.182, "num_tokens": 35434073.0, "reward": 1.4567782878875732, "reward_std": 0.13338518142700195, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.46868306398391724, "rewards/correct_reward_func/std": 0.16619150340557098, "step": 277 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2141.0, "completions/max_terminated_length": 2141.0, "completions/mean_length": 1501.5595703125, "completions/mean_terminated_length": 1501.5595703125, "completions/min_length": 880.0, "completions/min_terminated_length": 880.0, "epoch": 0.43302180685358255, "grad_norm": 0.6221758723258972, "kl": 0.04269747622311115, "learning_rate": 1.8331249999999997e-06, "loss": 0.0145, "num_tokens": 35566138.0, "reward": 1.478410243988037, "reward_std": 0.0783148929476738, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.49031493067741394, "rewards/correct_reward_func/std": 0.1501626968383789, "step": 278 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7459.0, "completions/max_terminated_length": 7459.0, "completions/mean_length": 1598.6785888671875, "completions/mean_terminated_length": 1598.6785888671875, "completions/min_length": 924.0, "completions/min_terminated_length": 924.0, "epoch": 0.43457943925233644, "grad_norm": 0.5859509706497192, "kl": 0.04193317890167236, "learning_rate": 1.8325e-06, "loss": 0.0074, "num_tokens": 35706385.0, "reward": 1.4401181936264038, "reward_std": 0.0550164058804512, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4401181638240814, "rewards/correct_reward_func/std": 0.1537192016839981, "step": 279 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2371.0, "completions/mean_length": 1499.8095703125, "completions/mean_terminated_length": 1419.1806640625, "completions/min_length": 775.0, "completions/min_terminated_length": 775.0, "epoch": 0.43613707165109034, "grad_norm": 0.595498263835907, "kl": 0.04362577013671398, "learning_rate": 1.831875e-06, "loss": 0.0495, "num_tokens": 35838297.0, "reward": 1.4194005727767944, "reward_std": 0.08994495123624802, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4313051104545593, "rewards/correct_reward_func/std": 0.14579959213733673, "step": 280 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3234.0, "completions/max_terminated_length": 3234.0, "completions/mean_length": 1566.0357666015625, "completions/mean_terminated_length": 1566.0357666015625, "completions/min_length": 896.0, "completions/min_terminated_length": 896.0, "epoch": 0.43769470404984423, "grad_norm": 0.5680190920829773, "kl": 0.0422314815223217, "learning_rate": 1.83125e-06, "loss": 0.0167, "num_tokens": 35975880.0, "reward": 1.475471019744873, "reward_std": 0.11540813744068146, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.49928048253059387, "rewards/correct_reward_func/std": 0.1275622546672821, "step": 281 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2062.0, "completions/max_terminated_length": 2062.0, "completions/mean_length": 1490.1309814453125, "completions/mean_terminated_length": 1490.1309814453125, "completions/min_length": 944.0, "completions/min_terminated_length": 944.0, "epoch": 0.4392523364485981, "grad_norm": 0.5545284152030945, "kl": 0.044208116829395294, "learning_rate": 1.830625e-06, "loss": -0.0119, "num_tokens": 36106913.0, "reward": 1.467574119567871, "reward_std": 0.09451182931661606, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4794788360595703, "rewards/correct_reward_func/std": 0.15235535800457, "step": 282 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2859.0, "completions/max_terminated_length": 2859.0, "completions/mean_length": 1569.4405517578125, "completions/mean_terminated_length": 1569.4405517578125, "completions/min_length": 964.0, "completions/min_terminated_length": 964.0, "epoch": 0.440809968847352, "grad_norm": 0.5956711173057556, "kl": 0.04277007095515728, "learning_rate": 1.83e-06, "loss": -0.0127, "num_tokens": 36244560.0, "reward": 1.465549111366272, "reward_std": 0.0775151252746582, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4774538576602936, "rewards/correct_reward_func/std": 0.18763205409049988, "step": 283 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3013.0, "completions/max_terminated_length": 3013.0, "completions/mean_length": 1505.1905517578125, "completions/mean_terminated_length": 1505.1905517578125, "completions/min_length": 562.0, "completions/min_terminated_length": 562.0, "epoch": 0.4423676012461059, "grad_norm": 0.613337516784668, "kl": 0.04313294030725956, "learning_rate": 1.829375e-06, "loss": -0.0096, "num_tokens": 36376960.0, "reward": 1.51215660572052, "reward_std": 0.07286342978477478, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5121565461158752, "rewards/correct_reward_func/std": 0.17705413699150085, "step": 284 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2579.0, "completions/max_terminated_length": 2579.0, "completions/mean_length": 1624.3809814453125, "completions/mean_terminated_length": 1624.3809814453125, "completions/min_length": 859.0, "completions/min_terminated_length": 859.0, "epoch": 0.4439252336448598, "grad_norm": 0.5604744553565979, "kl": 0.04310506582260132, "learning_rate": 1.82875e-06, "loss": 0.008, "num_tokens": 36519414.0, "reward": 1.4455859661102295, "reward_std": 0.07997937500476837, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.45749059319496155, "rewards/correct_reward_func/std": 0.15531718730926514, "step": 285 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2318.0, "completions/max_terminated_length": 2318.0, "completions/mean_length": 1497.3809814453125, "completions/mean_terminated_length": 1497.3809814453125, "completions/min_length": 842.0, "completions/min_terminated_length": 842.0, "epoch": 0.4454828660436137, "grad_norm": 0.6142125129699707, "kl": 0.04405369609594345, "learning_rate": 1.828125e-06, "loss": 0.0213, "num_tokens": 36651170.0, "reward": 1.4967048168182373, "reward_std": 0.05460391938686371, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.49670469760894775, "rewards/correct_reward_func/std": 0.11759886145591736, "step": 286 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2114.0, "completions/max_terminated_length": 2114.0, "completions/mean_length": 1550.3809814453125, "completions/mean_terminated_length": 1550.3809814453125, "completions/min_length": 801.0, "completions/min_terminated_length": 801.0, "epoch": 0.4470404984423676, "grad_norm": 0.5982187390327454, "kl": 0.043098822236061096, "learning_rate": 1.8274999999999999e-06, "loss": 0.0141, "num_tokens": 36787384.0, "reward": 1.5390734672546387, "reward_std": 0.07396355271339417, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5390734672546387, "rewards/correct_reward_func/std": 0.15539847314357758, "step": 287 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2471.0, "completions/mean_length": 1590.9285888671875, "completions/mean_terminated_length": 1511.3975830078125, "completions/min_length": 472.0, "completions/min_terminated_length": 472.0, "epoch": 0.4485981308411215, "grad_norm": 0.5779162645339966, "kl": 0.04267328046262264, "learning_rate": 1.826875e-06, "loss": 0.0529, "num_tokens": 36926968.0, "reward": 1.4345697164535522, "reward_std": 0.06787623465061188, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4345696270465851, "rewards/correct_reward_func/std": 0.1536804884672165, "step": 288 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2519.0, "completions/max_terminated_length": 2519.0, "completions/mean_length": 1572.46435546875, "completions/mean_terminated_length": 1572.46435546875, "completions/min_length": 619.0, "completions/min_terminated_length": 619.0, "epoch": 0.4501557632398754, "grad_norm": 0.5400437116622925, "kl": 0.0447152704000473, "learning_rate": 1.8262499999999999e-06, "loss": -0.0149, "num_tokens": 37065001.0, "reward": 1.4242392778396606, "reward_std": 0.0718853771686554, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.42423921823501587, "rewards/correct_reward_func/std": 0.12876558303833008, "step": 289 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2585.0, "completions/max_terminated_length": 2585.0, "completions/mean_length": 1544.71435546875, "completions/mean_terminated_length": 1544.71435546875, "completions/min_length": 554.0, "completions/min_terminated_length": 554.0, "epoch": 0.4517133956386293, "grad_norm": 0.5619280338287354, "kl": 0.044344568625092506, "learning_rate": 1.825625e-06, "loss": 0.0099, "num_tokens": 37200853.0, "reward": 1.4639300107955933, "reward_std": 0.1035037636756897, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.47583478689193726, "rewards/correct_reward_func/std": 0.17393389344215393, "step": 290 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2626.0, "completions/max_terminated_length": 2626.0, "completions/mean_length": 1503.4761962890625, "completions/mean_terminated_length": 1503.4761962890625, "completions/min_length": 825.0, "completions/min_terminated_length": 825.0, "epoch": 0.4532710280373832, "grad_norm": 0.5982638001441956, "kl": 0.04427545331418514, "learning_rate": 1.8249999999999999e-06, "loss": -0.0342, "num_tokens": 37332971.0, "reward": 1.4380909204483032, "reward_std": 0.06856860220432281, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.43809083104133606, "rewards/correct_reward_func/std": 0.13657042384147644, "step": 291 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2390.0, "completions/max_terminated_length": 2390.0, "completions/mean_length": 1529.8214111328125, "completions/mean_terminated_length": 1529.8214111328125, "completions/min_length": 796.0, "completions/min_terminated_length": 796.0, "epoch": 0.45482866043613707, "grad_norm": 0.6071525812149048, "kl": 0.04384468495845795, "learning_rate": 1.824375e-06, "loss": 0.0026, "num_tokens": 37467494.0, "reward": 1.4335728883743286, "reward_std": 0.04877452179789543, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.43357276916503906, "rewards/correct_reward_func/std": 0.14995594322681427, "step": 292 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2314.0, "completions/max_terminated_length": 2314.0, "completions/mean_length": 1446.8690185546875, "completions/mean_terminated_length": 1446.8690185546875, "completions/min_length": 732.0, "completions/min_terminated_length": 732.0, "epoch": 0.45638629283489096, "grad_norm": 0.6094366312026978, "kl": 0.04509362578392029, "learning_rate": 1.82375e-06, "loss": 0.0195, "num_tokens": 37594917.0, "reward": 1.5306507349014282, "reward_std": 0.08785208314657211, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5306507349014282, "rewards/correct_reward_func/std": 0.14832548797130585, "step": 293 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2716.0, "completions/max_terminated_length": 2716.0, "completions/mean_length": 1544.011962890625, "completions/mean_terminated_length": 1544.011962890625, "completions/min_length": 843.0, "completions/min_terminated_length": 843.0, "epoch": 0.45794392523364486, "grad_norm": 0.6053863167762756, "kl": 0.04364632070064545, "learning_rate": 1.823125e-06, "loss": -0.0119, "num_tokens": 37730644.0, "reward": 1.4835604429244995, "reward_std": 0.057300373911857605, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4835604131221771, "rewards/correct_reward_func/std": 0.17512056231498718, "step": 294 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3292.0, "completions/max_terminated_length": 3292.0, "completions/mean_length": 1579.09521484375, "completions/mean_terminated_length": 1579.09521484375, "completions/min_length": 795.0, "completions/min_terminated_length": 795.0, "epoch": 0.45950155763239875, "grad_norm": 0.5880759954452515, "kl": 0.0431599710136652, "learning_rate": 1.8225e-06, "loss": -0.0029, "num_tokens": 37869492.0, "reward": 1.462117314338684, "reward_std": 0.05300503969192505, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.46211737394332886, "rewards/correct_reward_func/std": 0.11961612105369568, "step": 295 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2367.0, "completions/max_terminated_length": 2367.0, "completions/mean_length": 1529.5, "completions/mean_terminated_length": 1529.5, "completions/min_length": 519.0, "completions/min_terminated_length": 519.0, "epoch": 0.46105919003115264, "grad_norm": 0.5763446092605591, "kl": 0.04444514401257038, "learning_rate": 1.8218749999999998e-06, "loss": 0.0078, "num_tokens": 38003970.0, "reward": 1.4610320329666138, "reward_std": 0.08918090909719467, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.46103209257125854, "rewards/correct_reward_func/std": 0.13395950198173523, "step": 296 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2607.0, "completions/mean_length": 1656.46435546875, "completions/mean_terminated_length": 1577.7227783203125, "completions/min_length": 929.0, "completions/min_terminated_length": 929.0, "epoch": 0.46261682242990654, "grad_norm": 0.5834442973136902, "kl": 0.041973644867539406, "learning_rate": 1.82125e-06, "loss": 0.0617, "num_tokens": 38149245.0, "reward": 1.4773956537246704, "reward_std": 0.09391757100820541, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.48930031061172485, "rewards/correct_reward_func/std": 0.1633952558040619, "step": 297 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2122.0, "completions/max_terminated_length": 2122.0, "completions/mean_length": 1549.3929443359375, "completions/mean_terminated_length": 1549.3929443359375, "completions/min_length": 818.0, "completions/min_terminated_length": 818.0, "epoch": 0.46417445482866043, "grad_norm": 0.6317084431648254, "kl": 0.046587640419602394, "learning_rate": 1.8206249999999998e-06, "loss": 0.0175, "num_tokens": 38285280.0, "reward": 1.449130892753601, "reward_std": 0.14152653515338898, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.47294026613235474, "rewards/correct_reward_func/std": 0.14357496798038483, "step": 298 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2408.0, "completions/max_terminated_length": 2408.0, "completions/mean_length": 1490.2738037109375, "completions/mean_terminated_length": 1490.2738037109375, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.4657320872274143, "grad_norm": 0.5916226506233215, "kl": 0.04444451816380024, "learning_rate": 1.82e-06, "loss": -0.0187, "num_tokens": 38416565.0, "reward": 1.4541789293289185, "reward_std": 0.04750651866197586, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.45417898893356323, "rewards/correct_reward_func/std": 0.1580149382352829, "step": 299 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2012.0, "completions/mean_length": 1604.3333740234375, "completions/mean_terminated_length": 1524.9637451171875, "completions/min_length": 1040.0, "completions/min_terminated_length": 1040.0, "epoch": 0.4672897196261682, "grad_norm": 0.5734702944755554, "kl": 0.0429048128426075, "learning_rate": 1.8193749999999998e-06, "loss": 0.0538, "num_tokens": 38557047.0, "reward": 1.4508525133132935, "reward_std": 0.12774604558944702, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.47466206550598145, "rewards/correct_reward_func/std": 0.14542065560817719, "step": 300 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2414.0, "completions/max_terminated_length": 2414.0, "completions/mean_length": 1552.4761962890625, "completions/mean_terminated_length": 1552.4761962890625, "completions/min_length": 899.0, "completions/min_terminated_length": 899.0, "epoch": 0.4688473520249221, "grad_norm": 0.596815288066864, "kl": 0.043997010216116905, "learning_rate": 1.81875e-06, "loss": 0.0025, "num_tokens": 38693335.0, "reward": 1.4665279388427734, "reward_std": 0.0923430472612381, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4784325957298279, "rewards/correct_reward_func/std": 0.1467050313949585, "step": 301 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5495.0, "completions/max_terminated_length": 5495.0, "completions/mean_length": 1538.1785888671875, "completions/mean_terminated_length": 1538.1785888671875, "completions/min_length": 959.0, "completions/min_terminated_length": 959.0, "epoch": 0.470404984423676, "grad_norm": 0.5941459536552429, "kl": 0.043290507048368454, "learning_rate": 1.8181249999999999e-06, "loss": 0.0057, "num_tokens": 38828434.0, "reward": 1.5739519596099854, "reward_std": 0.07381974905729294, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5739518404006958, "rewards/correct_reward_func/std": 0.1708972305059433, "step": 302 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2525.0, "completions/max_terminated_length": 2525.0, "completions/mean_length": 1535.1190185546875, "completions/mean_terminated_length": 1535.1190185546875, "completions/min_length": 59.0, "completions/min_terminated_length": 59.0, "epoch": 0.4719626168224299, "grad_norm": 0.6311259865760803, "kl": 0.044257769361138344, "learning_rate": 1.8174999999999998e-06, "loss": -0.0331, "num_tokens": 38963522.0, "reward": 1.4050052165985107, "reward_std": 0.12261962890625, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.42881447076797485, "rewards/correct_reward_func/std": 0.15219928324222565, "step": 303 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2573.0, "completions/max_terminated_length": 2573.0, "completions/mean_length": 1518.4405517578125, "completions/mean_terminated_length": 1518.4405517578125, "completions/min_length": 951.0, "completions/min_terminated_length": 951.0, "epoch": 0.4735202492211838, "grad_norm": 0.5748085379600525, "kl": 0.0418586116284132, "learning_rate": 1.8168749999999999e-06, "loss": -0.0019, "num_tokens": 39097101.0, "reward": 1.4809808731079102, "reward_std": 0.09693938493728638, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4928855299949646, "rewards/correct_reward_func/std": 0.14456793665885925, "step": 304 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2232.0, "completions/max_terminated_length": 2232.0, "completions/mean_length": 1455.0595703125, "completions/mean_terminated_length": 1455.0595703125, "completions/min_length": 795.0, "completions/min_terminated_length": 795.0, "epoch": 0.4750778816199377, "grad_norm": 0.6262243986129761, "kl": 0.04568205960094929, "learning_rate": 1.8162499999999998e-06, "loss": 0.005, "num_tokens": 39225266.0, "reward": 1.5073304176330566, "reward_std": 0.0670827329158783, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5073302984237671, "rewards/correct_reward_func/std": 0.12795594334602356, "step": 305 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2476.0, "completions/max_terminated_length": 2476.0, "completions/mean_length": 1483.90478515625, "completions/mean_terminated_length": 1483.90478515625, "completions/min_length": 917.0, "completions/min_terminated_length": 917.0, "epoch": 0.4766355140186916, "grad_norm": 0.6198824644088745, "kl": 0.04370650835335255, "learning_rate": 1.8156249999999999e-06, "loss": 0.0063, "num_tokens": 39356004.0, "reward": 1.5359094142913818, "reward_std": 0.04993622750043869, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5359094142913818, "rewards/correct_reward_func/std": 0.13109326362609863, "step": 306 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2527.0, "completions/max_terminated_length": 2527.0, "completions/mean_length": 1559.5, "completions/mean_terminated_length": 1559.5, "completions/min_length": 896.0, "completions/min_terminated_length": 896.0, "epoch": 0.4781931464174455, "grad_norm": 0.5853797197341919, "kl": 0.04342350363731384, "learning_rate": 1.8149999999999998e-06, "loss": 0.0145, "num_tokens": 39493110.0, "reward": 1.4782713651657104, "reward_std": 0.08605591952800751, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.49017614126205444, "rewards/correct_reward_func/std": 0.1508086770772934, "step": 307 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2326.0, "completions/max_terminated_length": 2326.0, "completions/mean_length": 1511.09521484375, "completions/mean_terminated_length": 1511.09521484375, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.4797507788161994, "grad_norm": 0.5664383769035339, "kl": 0.04395752586424351, "learning_rate": 1.8143749999999999e-06, "loss": 0.0426, "num_tokens": 39625916.0, "reward": 1.4828287363052368, "reward_std": 0.06932734698057175, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.48282867670059204, "rewards/correct_reward_func/std": 0.17860376834869385, "step": 308 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2080.0, "completions/max_terminated_length": 2080.0, "completions/mean_length": 1484.916748046875, "completions/mean_terminated_length": 1484.916748046875, "completions/min_length": 884.0, "completions/min_terminated_length": 884.0, "epoch": 0.48130841121495327, "grad_norm": 0.617337167263031, "kl": 0.043506965041160583, "learning_rate": 1.8137499999999998e-06, "loss": -0.0018, "num_tokens": 39756451.0, "reward": 1.4927843809127808, "reward_std": 0.10661379992961884, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.50468909740448, "rewards/correct_reward_func/std": 0.1741182804107666, "step": 309 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2651.0, "completions/max_terminated_length": 2651.0, "completions/mean_length": 1531.8214111328125, "completions/mean_terminated_length": 1531.8214111328125, "completions/min_length": 865.0, "completions/min_terminated_length": 865.0, "epoch": 0.48286604361370716, "grad_norm": 0.635550320148468, "kl": 0.045443542301654816, "learning_rate": 1.8131250000000001e-06, "loss": -0.0145, "num_tokens": 39891016.0, "reward": 1.4746626615524292, "reward_std": 0.062213968485593796, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4746626019477844, "rewards/correct_reward_func/std": 0.1761779487133026, "step": 310 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2577.0, "completions/max_terminated_length": 2577.0, "completions/mean_length": 1529.7381591796875, "completions/mean_terminated_length": 1529.7381591796875, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.48442367601246106, "grad_norm": 0.573308527469635, "kl": 0.04426569677889347, "learning_rate": 1.8125e-06, "loss": -0.0208, "num_tokens": 40025544.0, "reward": 1.506926417350769, "reward_std": 0.07785354554653168, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.506926417350769, "rewards/correct_reward_func/std": 0.15344847738742828, "step": 311 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2376.0, "completions/max_terminated_length": 2376.0, "completions/mean_length": 1534.297607421875, "completions/mean_terminated_length": 1534.297607421875, "completions/min_length": 992.0, "completions/min_terminated_length": 992.0, "epoch": 0.48598130841121495, "grad_norm": 0.5884522199630737, "kl": 0.04365627467632294, "learning_rate": 1.811875e-06, "loss": -0.0051, "num_tokens": 40160329.0, "reward": 1.5241272449493408, "reward_std": 0.08640160411596298, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5241272449493408, "rewards/correct_reward_func/std": 0.1817137748003006, "step": 312 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2641.0, "completions/max_terminated_length": 2641.0, "completions/mean_length": 1530.40478515625, "completions/mean_terminated_length": 1530.40478515625, "completions/min_length": 851.0, "completions/min_terminated_length": 851.0, "epoch": 0.48753894080996885, "grad_norm": 0.6008781790733337, "kl": 0.04319826699793339, "learning_rate": 1.81125e-06, "loss": 0.0087, "num_tokens": 40294919.0, "reward": 1.5073949098587036, "reward_std": 0.06965342164039612, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5073949098587036, "rewards/correct_reward_func/std": 0.17690497636795044, "step": 313 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2659.0, "completions/max_terminated_length": 2659.0, "completions/mean_length": 1450.0, "completions/mean_terminated_length": 1450.0, "completions/min_length": 659.0, "completions/min_terminated_length": 659.0, "epoch": 0.48909657320872274, "grad_norm": 0.728448748588562, "kl": 0.044476715847849846, "learning_rate": 1.810625e-06, "loss": 0.0265, "num_tokens": 40422653.0, "reward": 1.4225661754608154, "reward_std": 0.1585291177034378, "rewards/contains_chinese/mean": 0.9642857313156128, "rewards/contains_chinese/std": 0.18669144809246063, "rewards/correct_reward_func/mean": 0.458280473947525, "rewards/correct_reward_func/std": 0.17805902659893036, "step": 314 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2806.0, "completions/max_terminated_length": 2806.0, "completions/mean_length": 1425.8929443359375, "completions/mean_terminated_length": 1425.8929443359375, "completions/min_length": 819.0, "completions/min_terminated_length": 819.0, "epoch": 0.49065420560747663, "grad_norm": 0.6005982160568237, "kl": 0.043727852404117584, "learning_rate": 1.81e-06, "loss": 0.0128, "num_tokens": 40548212.0, "reward": 1.447396993637085, "reward_std": 0.0689636841416359, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4473969638347626, "rewards/correct_reward_func/std": 0.10822274535894394, "step": 315 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2858.0, "completions/mean_length": 1671.4761962890625, "completions/mean_terminated_length": 1592.9156494140625, "completions/min_length": 1111.0, "completions/min_terminated_length": 1111.0, "epoch": 0.49221183800623053, "grad_norm": 0.5339775085449219, "kl": 0.04142884351313114, "learning_rate": 1.809375e-06, "loss": 0.0495, "num_tokens": 40694814.0, "reward": 1.5396287441253662, "reward_std": 0.06713546812534332, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5396286845207214, "rewards/correct_reward_func/std": 0.16389040648937225, "step": 316 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2193.0, "completions/max_terminated_length": 2193.0, "completions/mean_length": 1511.6190185546875, "completions/mean_terminated_length": 1511.6190185546875, "completions/min_length": 689.0, "completions/min_terminated_length": 689.0, "epoch": 0.4937694704049844, "grad_norm": 0.5666081309318542, "kl": 0.04457671754062176, "learning_rate": 1.80875e-06, "loss": -0.0035, "num_tokens": 40827688.0, "reward": 1.4729593992233276, "reward_std": 0.06596960127353668, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.47295936942100525, "rewards/correct_reward_func/std": 0.18029561638832092, "step": 317 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2383.0, "completions/max_terminated_length": 2383.0, "completions/mean_length": 1499.1309814453125, "completions/mean_terminated_length": 1499.1309814453125, "completions/min_length": 580.0, "completions/min_terminated_length": 580.0, "epoch": 0.4953271028037383, "grad_norm": 0.5493736863136292, "kl": 0.04210697114467621, "learning_rate": 1.808125e-06, "loss": 0.0096, "num_tokens": 40959555.0, "reward": 1.4608927965164185, "reward_std": 0.05956989526748657, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.46089282631874084, "rewards/correct_reward_func/std": 0.13776575028896332, "step": 318 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2211.0, "completions/max_terminated_length": 2211.0, "completions/mean_length": 1471.8214111328125, "completions/mean_terminated_length": 1471.8214111328125, "completions/min_length": 996.0, "completions/min_terminated_length": 996.0, "epoch": 0.4968847352024922, "grad_norm": 0.5784661173820496, "kl": 0.04475216940045357, "learning_rate": 1.8075e-06, "loss": 0.003, "num_tokens": 41089194.0, "reward": 1.4597842693328857, "reward_std": 0.06170998513698578, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4597841799259186, "rewards/correct_reward_func/std": 0.132298082113266, "step": 319 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2501.0, "completions/max_terminated_length": 2501.0, "completions/mean_length": 1545.2381591796875, "completions/mean_terminated_length": 1545.2381591796875, "completions/min_length": 461.0, "completions/min_terminated_length": 461.0, "epoch": 0.4984423676012461, "grad_norm": 0.5935900211334229, "kl": 0.04243394732475281, "learning_rate": 1.806875e-06, "loss": 0.0376, "num_tokens": 41224964.0, "reward": 1.4219588041305542, "reward_std": 0.07682619988918304, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.43386340141296387, "rewards/correct_reward_func/std": 0.117339126765728, "step": 320 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2246.0, "completions/max_terminated_length": 2246.0, "completions/mean_length": 1519.857177734375, "completions/mean_terminated_length": 1519.857177734375, "completions/min_length": 846.0, "completions/min_terminated_length": 846.0, "epoch": 0.5, "grad_norm": 0.5818554162979126, "kl": 0.042232925072312355, "learning_rate": 1.8062499999999999e-06, "loss": 0.0097, "num_tokens": 41358608.0, "reward": 1.4776239395141602, "reward_std": 0.04936147853732109, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4776238799095154, "rewards/correct_reward_func/std": 0.10135854780673981, "step": 321 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2374.0, "completions/max_terminated_length": 2374.0, "completions/mean_length": 1553.5357666015625, "completions/mean_terminated_length": 1553.5357666015625, "completions/min_length": 1009.0, "completions/min_terminated_length": 1009.0, "epoch": 0.5015576323987538, "grad_norm": 0.5444162487983704, "kl": 0.04500117152929306, "learning_rate": 1.805625e-06, "loss": 0.0176, "num_tokens": 41494955.0, "reward": 1.4176223278045654, "reward_std": 0.08014075458049774, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4295269250869751, "rewards/correct_reward_func/std": 0.11872898042201996, "step": 322 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2270.0, "completions/max_terminated_length": 2270.0, "completions/mean_length": 1534.8333740234375, "completions/mean_terminated_length": 1534.8333740234375, "completions/min_length": 1019.0, "completions/min_terminated_length": 1019.0, "epoch": 0.5031152647975078, "grad_norm": 0.6494289040565491, "kl": 0.04654599726200104, "learning_rate": 1.8049999999999999e-06, "loss": -0.0155, "num_tokens": 41630079.0, "reward": 1.550278663635254, "reward_std": 0.0712866261601448, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5502786040306091, "rewards/correct_reward_func/std": 0.11994405835866928, "step": 323 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2504.0, "completions/max_terminated_length": 2504.0, "completions/mean_length": 1490.15478515625, "completions/mean_terminated_length": 1490.15478515625, "completions/min_length": 819.0, "completions/min_terminated_length": 819.0, "epoch": 0.5046728971962616, "grad_norm": 0.5980774760246277, "kl": 0.044281333684921265, "learning_rate": 1.804375e-06, "loss": -0.0166, "num_tokens": 41761258.0, "reward": 1.4597599506378174, "reward_std": 0.06575565785169601, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4597598910331726, "rewards/correct_reward_func/std": 0.13967834413051605, "step": 324 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2893.0, "completions/max_terminated_length": 2893.0, "completions/mean_length": 1490.0833740234375, "completions/mean_terminated_length": 1490.0833740234375, "completions/min_length": 772.0, "completions/min_terminated_length": 772.0, "epoch": 0.5062305295950156, "grad_norm": 0.636524498462677, "kl": 0.04570058174431324, "learning_rate": 1.8037499999999999e-06, "loss": 0.0381, "num_tokens": 41892449.0, "reward": 1.473886251449585, "reward_std": 0.08418666571378708, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4738861322402954, "rewards/correct_reward_func/std": 0.11435925960540771, "step": 325 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1942.0, "completions/max_terminated_length": 1942.0, "completions/mean_length": 1402.40478515625, "completions/mean_terminated_length": 1402.40478515625, "completions/min_length": 754.0, "completions/min_terminated_length": 754.0, "epoch": 0.5077881619937694, "grad_norm": 0.6131139397621155, "kl": 0.04529164917767048, "learning_rate": 1.803125e-06, "loss": 0.0006, "num_tokens": 42016257.0, "reward": 1.4442152976989746, "reward_std": 0.06880811601877213, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4442150890827179, "rewards/correct_reward_func/std": 0.15375681221485138, "step": 326 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2295.0, "completions/mean_length": 1520.6785888671875, "completions/mean_terminated_length": 1440.3011474609375, "completions/min_length": 868.0, "completions/min_terminated_length": 868.0, "epoch": 0.5093457943925234, "grad_norm": 0.5682876706123352, "kl": 0.043439922854304314, "learning_rate": 1.8025e-06, "loss": 0.0475, "num_tokens": 42150042.0, "reward": 1.4249800443649292, "reward_std": 0.09845460206270218, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.43688473105430603, "rewards/correct_reward_func/std": 0.1302318572998047, "step": 327 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2421.0, "completions/max_terminated_length": 2421.0, "completions/mean_length": 1349.4285888671875, "completions/mean_terminated_length": 1349.4285888671875, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.5109034267912772, "grad_norm": 0.569858729839325, "kl": 0.043210411444306374, "learning_rate": 1.8018749999999998e-06, "loss": -0.0357, "num_tokens": 42269154.0, "reward": 1.5177992582321167, "reward_std": 0.08048205822706223, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5177991986274719, "rewards/correct_reward_func/std": 0.15029731392860413, "step": 328 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2338.0, "completions/mean_length": 1542.7261962890625, "completions/mean_terminated_length": 1462.6143798828125, "completions/min_length": 817.0, "completions/min_terminated_length": 817.0, "epoch": 0.5124610591900312, "grad_norm": 0.5456616878509521, "kl": 0.07378330640494823, "learning_rate": 1.80125e-06, "loss": 0.0514, "num_tokens": 42404617.0, "reward": 1.4895997047424316, "reward_std": 0.07681519538164139, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4895995855331421, "rewards/correct_reward_func/std": 0.2211972177028656, "step": 329 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1804.0, "completions/max_terminated_length": 1804.0, "completions/mean_length": 1356.261962890625, "completions/mean_terminated_length": 1356.261962890625, "completions/min_length": 743.0, "completions/min_terminated_length": 743.0, "epoch": 0.514018691588785, "grad_norm": 0.6183797121047974, "kl": 0.04728836566209793, "learning_rate": 1.8006249999999998e-06, "loss": -0.0147, "num_tokens": 42524399.0, "reward": 1.4941986799240112, "reward_std": 0.06612447649240494, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.49419865012168884, "rewards/correct_reward_func/std": 0.12598052620887756, "step": 330 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2059.0, "completions/max_terminated_length": 2059.0, "completions/mean_length": 1400.3333740234375, "completions/mean_terminated_length": 1400.3333740234375, "completions/min_length": 844.0, "completions/min_terminated_length": 844.0, "epoch": 0.5155763239875389, "grad_norm": 0.564521849155426, "kl": 0.04574920795857906, "learning_rate": 1.8e-06, "loss": 0.0249, "num_tokens": 42647847.0, "reward": 1.5129057168960571, "reward_std": 0.055123478174209595, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5129056572914124, "rewards/correct_reward_func/std": 0.14364565908908844, "step": 331 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2623.0, "completions/mean_length": 1540.7857666015625, "completions/mean_terminated_length": 1460.6505126953125, "completions/min_length": 903.0, "completions/min_terminated_length": 903.0, "epoch": 0.5171339563862928, "grad_norm": 0.6232556700706482, "kl": 0.04752310924232006, "learning_rate": 1.7993749999999998e-06, "loss": 0.0535, "num_tokens": 42783291.0, "reward": 1.4896059036254883, "reward_std": 0.07293432950973511, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4896059036254883, "rewards/correct_reward_func/std": 0.17983748018741608, "step": 332 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2000.0, "completions/mean_length": 1505.9881591796875, "completions/mean_terminated_length": 1425.4337158203125, "completions/min_length": 659.0, "completions/min_terminated_length": 659.0, "epoch": 0.5186915887850467, "grad_norm": 0.5811641216278076, "kl": 0.043217698112130165, "learning_rate": 1.79875e-06, "loss": 0.022, "num_tokens": 42915812.0, "reward": 1.4960260391235352, "reward_std": 0.0619584396481514, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4960259795188904, "rewards/correct_reward_func/std": 0.13359470665454865, "step": 333 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2062.0, "completions/max_terminated_length": 2062.0, "completions/mean_length": 1459.607177734375, "completions/mean_terminated_length": 1459.607177734375, "completions/min_length": 905.0, "completions/min_terminated_length": 905.0, "epoch": 0.5202492211838006, "grad_norm": 0.5979028344154358, "kl": 0.04545888490974903, "learning_rate": 1.7981249999999998e-06, "loss": 0.0155, "num_tokens": 43044449.0, "reward": 1.4552279710769653, "reward_std": 0.06798920035362244, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.45522791147232056, "rewards/correct_reward_func/std": 0.12315916270017624, "step": 334 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2271.0, "completions/max_terminated_length": 2271.0, "completions/mean_length": 1457.8690185546875, "completions/mean_terminated_length": 1457.8690185546875, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.5218068535825545, "grad_norm": 0.5863680839538574, "kl": 0.045693760737776756, "learning_rate": 1.7975e-06, "loss": -0.0156, "num_tokens": 43173012.0, "reward": 1.4975252151489258, "reward_std": 0.0669432058930397, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4975251257419586, "rewards/correct_reward_func/std": 0.13803456723690033, "step": 335 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2238.0, "completions/mean_length": 1608.5357666015625, "completions/mean_terminated_length": 1529.216796875, "completions/min_length": 1015.0, "completions/min_terminated_length": 1015.0, "epoch": 0.5233644859813084, "grad_norm": 0.556442379951477, "kl": 0.04240516573190689, "learning_rate": 1.7968749999999998e-06, "loss": 0.0712, "num_tokens": 43314111.0, "reward": 1.51832115650177, "reward_std": 0.1056382805109024, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5302258729934692, "rewards/correct_reward_func/std": 0.1863190233707428, "step": 336 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2391.0, "completions/max_terminated_length": 2391.0, "completions/mean_length": 1453.6905517578125, "completions/mean_terminated_length": 1453.6905517578125, "completions/min_length": 787.0, "completions/min_terminated_length": 787.0, "epoch": 0.5249221183800623, "grad_norm": 0.6169154644012451, "kl": 0.04544537514448166, "learning_rate": 1.7962499999999997e-06, "loss": 0.0458, "num_tokens": 43442275.0, "reward": 1.4681013822555542, "reward_std": 0.06039302796125412, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4681013226509094, "rewards/correct_reward_func/std": 0.1258929818868637, "step": 337 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2267.0, "completions/max_terminated_length": 2267.0, "completions/mean_length": 1420.6190185546875, "completions/mean_terminated_length": 1420.6190185546875, "completions/min_length": 858.0, "completions/min_terminated_length": 858.0, "epoch": 0.5264797507788161, "grad_norm": 0.5530162453651428, "kl": 0.04460956156253815, "learning_rate": 1.7956249999999999e-06, "loss": -0.0063, "num_tokens": 43567595.0, "reward": 1.4919437170028687, "reward_std": 0.05034913867712021, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.49194350838661194, "rewards/correct_reward_func/std": 0.1505471169948578, "step": 338 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2238.0, "completions/max_terminated_length": 2238.0, "completions/mean_length": 1373.5238037109375, "completions/mean_terminated_length": 1373.5238037109375, "completions/min_length": 797.0, "completions/min_terminated_length": 797.0, "epoch": 0.5280373831775701, "grad_norm": 0.6410908102989197, "kl": 0.04922908917069435, "learning_rate": 1.7949999999999998e-06, "loss": 0.0041, "num_tokens": 43688893.0, "reward": 1.4642657041549683, "reward_std": 0.047354813665151596, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4642656445503235, "rewards/correct_reward_func/std": 0.1483275443315506, "step": 339 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2333.0, "completions/max_terminated_length": 2333.0, "completions/mean_length": 1476.666748046875, "completions/mean_terminated_length": 1476.666748046875, "completions/min_length": 825.0, "completions/min_terminated_length": 825.0, "epoch": 0.5295950155763239, "grad_norm": 0.5679633021354675, "kl": 0.04452272690832615, "learning_rate": 1.7943749999999999e-06, "loss": 0.0016, "num_tokens": 43819023.0, "reward": 1.4857591390609741, "reward_std": 0.06386592239141464, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.48575901985168457, "rewards/correct_reward_func/std": 0.1048179492354393, "step": 340 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2557.0, "completions/mean_length": 1551.3929443359375, "completions/mean_terminated_length": 1471.385498046875, "completions/min_length": 634.0, "completions/min_terminated_length": 634.0, "epoch": 0.5311526479750779, "grad_norm": 0.5662598013877869, "kl": 0.04444094002246857, "learning_rate": 1.79375e-06, "loss": 0.07, "num_tokens": 43955154.0, "reward": 1.4689278602600098, "reward_std": 0.06177349016070366, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4689278304576874, "rewards/correct_reward_func/std": 0.1346137970685959, "step": 341 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1957.0, "completions/max_terminated_length": 1957.0, "completions/mean_length": 1437.9285888671875, "completions/mean_terminated_length": 1437.9285888671875, "completions/min_length": 887.0, "completions/min_terminated_length": 887.0, "epoch": 0.5327102803738317, "grad_norm": 0.6565880179405212, "kl": 0.04598667845129967, "learning_rate": 1.793125e-06, "loss": -0.0001, "num_tokens": 44081778.0, "reward": 1.5010194778442383, "reward_std": 0.06478109210729599, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5010193586349487, "rewards/correct_reward_func/std": 0.18090546131134033, "step": 342 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2467.0, "completions/mean_length": 1543.09521484375, "completions/mean_terminated_length": 1462.9879150390625, "completions/min_length": 538.0, "completions/min_terminated_length": 538.0, "epoch": 0.5342679127725857, "grad_norm": 0.6114045977592468, "kl": 0.043492890894412994, "learning_rate": 1.7925e-06, "loss": 0.0833, "num_tokens": 44217458.0, "reward": 1.45277738571167, "reward_std": 0.11382251977920532, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.47658684849739075, "rewards/correct_reward_func/std": 0.16254591941833496, "step": 343 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2252.0, "completions/mean_length": 1516.1429443359375, "completions/mean_terminated_length": 1435.7108154296875, "completions/min_length": 918.0, "completions/min_terminated_length": 918.0, "epoch": 0.5358255451713395, "grad_norm": 0.5839765667915344, "kl": 0.04525020532310009, "learning_rate": 1.791875e-06, "loss": 0.0436, "num_tokens": 44350790.0, "reward": 1.4360476732254028, "reward_std": 0.06001214683055878, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.43604764342308044, "rewards/correct_reward_func/std": 0.1315266191959381, "step": 344 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2188.0, "completions/max_terminated_length": 2188.0, "completions/mean_length": 1429.1785888671875, "completions/mean_terminated_length": 1429.1785888671875, "completions/min_length": 958.0, "completions/min_terminated_length": 958.0, "epoch": 0.5373831775700935, "grad_norm": 0.567746639251709, "kl": 0.04449603334069252, "learning_rate": 1.79125e-06, "loss": 0.0274, "num_tokens": 44476895.0, "reward": 1.4312893152236938, "reward_std": 0.0564139224588871, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.43128931522369385, "rewards/correct_reward_func/std": 0.13854992389678955, "step": 345 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 1952.0, "completions/mean_length": 1476.25, "completions/mean_terminated_length": 1395.3372802734375, "completions/min_length": 743.0, "completions/min_terminated_length": 743.0, "epoch": 0.5389408099688473, "grad_norm": 0.5948997735977173, "kl": 0.04478558525443077, "learning_rate": 1.790625e-06, "loss": 0.048, "num_tokens": 44606786.0, "reward": 1.4746853113174438, "reward_std": 0.08000284433364868, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4746852219104767, "rewards/correct_reward_func/std": 0.1414322406053543, "step": 346 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2192.0, "completions/mean_length": 1520.8929443359375, "completions/mean_terminated_length": 1440.51806640625, "completions/min_length": 879.0, "completions/min_terminated_length": 879.0, "epoch": 0.5404984423676013, "grad_norm": 0.5873611569404602, "kl": 0.04373046010732651, "learning_rate": 1.79e-06, "loss": 0.0608, "num_tokens": 44740553.0, "reward": 1.3899791240692139, "reward_std": 0.1021641418337822, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4018838405609131, "rewards/correct_reward_func/std": 0.10966146737337112, "step": 347 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 3150.0, "completions/mean_length": 1603.3809814453125, "completions/mean_terminated_length": 1524.0, "completions/min_length": 765.0, "completions/min_terminated_length": 765.0, "epoch": 0.5420560747663551, "grad_norm": 0.6007145643234253, "kl": 0.042141517624258995, "learning_rate": 1.789375e-06, "loss": 0.0734, "num_tokens": 44881519.0, "reward": 1.516649842262268, "reward_std": 0.06937997788190842, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5166497230529785, "rewards/correct_reward_func/std": 0.13787348568439484, "step": 348 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.023809523809523836, "completions/max_length": 8192.0, "completions/max_terminated_length": 2165.0, "completions/mean_length": 1627.0357666015625, "completions/mean_terminated_length": 1466.91455078125, "completions/min_length": 906.0, "completions/min_terminated_length": 906.0, "epoch": 0.543613707165109, "grad_norm": 0.5334845185279846, "kl": 0.04171426221728325, "learning_rate": 1.78875e-06, "loss": 0.142, "num_tokens": 45024094.0, "reward": 1.4035788774490356, "reward_std": 0.07490548491477966, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.40357890725135803, "rewards/correct_reward_func/std": 0.11934227496385574, "step": 349 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2858.0, "completions/max_terminated_length": 2858.0, "completions/mean_length": 1388.5238037109375, "completions/mean_terminated_length": 1388.5238037109375, "completions/min_length": 853.0, "completions/min_terminated_length": 853.0, "epoch": 0.5451713395638629, "grad_norm": 0.5753508806228638, "kl": 0.04579620808362961, "learning_rate": 1.788125e-06, "loss": -0.003, "num_tokens": 45146580.0, "reward": 1.4059325456619263, "reward_std": 0.05845046043395996, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4059324264526367, "rewards/correct_reward_func/std": 0.14846326410770416, "step": 350 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2026.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 1354.6309814453125, "completions/mean_terminated_length": 1354.6309814453125, "completions/min_length": 827.0, "completions/min_terminated_length": 827.0, "epoch": 0.5467289719626168, "grad_norm": 0.6357160210609436, "kl": 0.045284371823072433, "learning_rate": 1.7875e-06, "loss": -0.0187, "num_tokens": 45266273.0, "reward": 1.4549281597137451, "reward_std": 0.07358434051275253, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.45492807030677795, "rewards/correct_reward_func/std": 0.12501084804534912, "step": 351 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2012.0, "completions/max_terminated_length": 2012.0, "completions/mean_length": 1370.2261962890625, "completions/mean_terminated_length": 1370.2261962890625, "completions/min_length": 884.0, "completions/min_terminated_length": 884.0, "epoch": 0.5482866043613707, "grad_norm": 0.6501032114028931, "kl": 0.04532886669039726, "learning_rate": 1.786875e-06, "loss": 0.0287, "num_tokens": 45387228.0, "reward": 1.509010910987854, "reward_std": 0.09563028067350388, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.5328204035758972, "rewards/correct_reward_func/std": 0.1301979273557663, "step": 352 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2229.0, "completions/mean_length": 1504.21435546875, "completions/mean_terminated_length": 1423.6385498046875, "completions/min_length": 701.0, "completions/min_terminated_length": 701.0, "epoch": 0.5498442367601246, "grad_norm": 0.614535391330719, "kl": 0.04563060216605663, "learning_rate": 1.7862499999999998e-06, "loss": 0.0562, "num_tokens": 45519630.0, "reward": 1.5245028734207153, "reward_std": 0.09191560745239258, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5245028138160706, "rewards/correct_reward_func/std": 0.19058886170387268, "step": 353 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2106.0, "completions/mean_length": 1371.3690185546875, "completions/mean_terminated_length": 1289.1927490234375, "completions/min_length": 747.0, "completions/min_terminated_length": 747.0, "epoch": 0.5514018691588785, "grad_norm": 0.5684062242507935, "kl": 0.04687408730387688, "learning_rate": 1.785625e-06, "loss": 0.1009, "num_tokens": 45640585.0, "reward": 1.423880696296692, "reward_std": 0.08402802050113678, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.43578535318374634, "rewards/correct_reward_func/std": 0.14543381333351135, "step": 354 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2220.0, "completions/max_terminated_length": 2220.0, "completions/mean_length": 1358.0238037109375, "completions/mean_terminated_length": 1358.0238037109375, "completions/min_length": 723.0, "completions/min_terminated_length": 723.0, "epoch": 0.5529595015576324, "grad_norm": 0.6448848843574524, "kl": 0.048309145495295525, "learning_rate": 1.7849999999999999e-06, "loss": -0.0182, "num_tokens": 45760533.0, "reward": 1.4921796321868896, "reward_std": 0.07159780710935593, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4921795129776001, "rewards/correct_reward_func/std": 0.15320508182048798, "step": 355 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2087.0, "completions/max_terminated_length": 2087.0, "completions/mean_length": 1354.71435546875, "completions/mean_terminated_length": 1354.71435546875, "completions/min_length": 480.0, "completions/min_terminated_length": 480.0, "epoch": 0.5545171339563862, "grad_norm": 0.6176822185516357, "kl": 0.04802674613893032, "learning_rate": 1.784375e-06, "loss": -0.0059, "num_tokens": 45880269.0, "reward": 1.4876474142074585, "reward_std": 0.09487791359424591, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4995521306991577, "rewards/correct_reward_func/std": 0.13108977675437927, "step": 356 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2128.0, "completions/max_terminated_length": 2128.0, "completions/mean_length": 1344.9285888671875, "completions/mean_terminated_length": 1344.9285888671875, "completions/min_length": 769.0, "completions/min_terminated_length": 769.0, "epoch": 0.5560747663551402, "grad_norm": 0.6490213871002197, "kl": 0.0472539346665144, "learning_rate": 1.7837499999999999e-06, "loss": 0.0013, "num_tokens": 45999249.0, "reward": 1.44069242477417, "reward_std": 0.11679985374212265, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.46450188755989075, "rewards/correct_reward_func/std": 0.13278451561927795, "step": 357 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2629.0, "completions/max_terminated_length": 2629.0, "completions/mean_length": 1377.7857666015625, "completions/mean_terminated_length": 1377.7857666015625, "completions/min_length": 887.0, "completions/min_terminated_length": 887.0, "epoch": 0.557632398753894, "grad_norm": 0.6063095331192017, "kl": 0.045867305248975754, "learning_rate": 1.783125e-06, "loss": -0.0135, "num_tokens": 46121055.0, "reward": 1.4912810325622559, "reward_std": 0.07250796258449554, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.49128106236457825, "rewards/correct_reward_func/std": 0.12492024898529053, "step": 358 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2068.0, "completions/max_terminated_length": 2068.0, "completions/mean_length": 1362.761962890625, "completions/mean_terminated_length": 1362.761962890625, "completions/min_length": 878.0, "completions/min_terminated_length": 878.0, "epoch": 0.559190031152648, "grad_norm": 0.6036370992660522, "kl": 0.047231562435626984, "learning_rate": 1.7824999999999999e-06, "loss": -0.0009, "num_tokens": 46241479.0, "reward": 1.4975894689559937, "reward_std": 0.05849050357937813, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4975893795490265, "rewards/correct_reward_func/std": 0.18169310688972473, "step": 359 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1936.0, "completions/max_terminated_length": 1936.0, "completions/mean_length": 1369.9881591796875, "completions/mean_terminated_length": 1369.9881591796875, "completions/min_length": 870.0, "completions/min_terminated_length": 870.0, "epoch": 0.5607476635514018, "grad_norm": 0.582613468170166, "kl": 0.04558840952813625, "learning_rate": 1.781875e-06, "loss": 0.0201, "num_tokens": 46362546.0, "reward": 1.4481348991394043, "reward_std": 0.09183409065008163, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.46003949642181396, "rewards/correct_reward_func/std": 0.13477934896945953, "step": 360 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2250.0, "completions/max_terminated_length": 2250.0, "completions/mean_length": 1380.261962890625, "completions/mean_terminated_length": 1380.261962890625, "completions/min_length": 837.0, "completions/min_terminated_length": 837.0, "epoch": 0.5623052959501558, "grad_norm": 0.6290069818496704, "kl": 0.04558514803647995, "learning_rate": 1.7812499999999999e-06, "loss": 0.0065, "num_tokens": 46484542.0, "reward": 1.4386088848114014, "reward_std": 0.09025963395833969, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4505135715007782, "rewards/correct_reward_func/std": 0.12712764739990234, "step": 361 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2111.0, "completions/max_terminated_length": 2111.0, "completions/mean_length": 1388.9881591796875, "completions/mean_terminated_length": 1388.9881591796875, "completions/min_length": 856.0, "completions/min_terminated_length": 856.0, "epoch": 0.5638629283489096, "grad_norm": 0.6121560335159302, "kl": 0.046877965331077576, "learning_rate": 1.7806249999999998e-06, "loss": -0.0113, "num_tokens": 46607247.0, "reward": 1.4698020219802856, "reward_std": 0.09116669744253159, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.48170679807662964, "rewards/correct_reward_func/std": 0.1016574278473854, "step": 362 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2040.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1318.9881591796875, "completions/mean_terminated_length": 1318.9881591796875, "completions/min_length": 765.0, "completions/min_terminated_length": 765.0, "epoch": 0.5654205607476636, "grad_norm": 0.6227669715881348, "kl": 0.046381985768675804, "learning_rate": 1.78e-06, "loss": -0.0136, "num_tokens": 46724030.0, "reward": 1.4805115461349487, "reward_std": 0.1278102546930313, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.5043209791183472, "rewards/correct_reward_func/std": 0.19428442418575287, "step": 363 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2555.0, "completions/max_terminated_length": 2555.0, "completions/mean_length": 1378.107177734375, "completions/mean_terminated_length": 1378.107177734375, "completions/min_length": 933.0, "completions/min_terminated_length": 933.0, "epoch": 0.5669781931464174, "grad_norm": 0.6208792328834534, "kl": 0.048077501356601715, "learning_rate": 1.7793749999999998e-06, "loss": 0.015, "num_tokens": 46845689.0, "reward": 1.4393149614334106, "reward_std": 0.06981474906206131, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.43931499123573303, "rewards/correct_reward_func/std": 0.13650043308734894, "step": 364 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 1724.0, "completions/mean_length": 1391.65478515625, "completions/mean_terminated_length": 1309.722900390625, "completions/min_length": 843.0, "completions/min_terminated_length": 843.0, "epoch": 0.5685358255451713, "grad_norm": 0.624458909034729, "kl": 0.04737947881221771, "learning_rate": 1.77875e-06, "loss": 0.0523, "num_tokens": 46968504.0, "reward": 1.3942302465438843, "reward_std": 0.11320418864488602, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.41803956031799316, "rewards/correct_reward_func/std": 0.14469270408153534, "step": 365 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 1959.0, "completions/mean_length": 1453.6905517578125, "completions/mean_terminated_length": 1372.5059814453125, "completions/min_length": 908.0, "completions/min_terminated_length": 908.0, "epoch": 0.5700934579439252, "grad_norm": 0.5907891988754272, "kl": 0.04476970434188843, "learning_rate": 1.7781249999999998e-06, "loss": 0.0414, "num_tokens": 47096638.0, "reward": 1.4558826684951782, "reward_std": 0.08813583105802536, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4558826684951782, "rewards/correct_reward_func/std": 0.1586223989725113, "step": 366 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2085.0, "completions/max_terminated_length": 2085.0, "completions/mean_length": 1346.416748046875, "completions/mean_terminated_length": 1346.416748046875, "completions/min_length": 661.0, "completions/min_terminated_length": 661.0, "epoch": 0.5716510903426791, "grad_norm": 0.6247698068618774, "kl": 0.04782709293067455, "learning_rate": 1.7775e-06, "loss": 0.0158, "num_tokens": 47215659.0, "reward": 1.4506251811981201, "reward_std": 0.061412323266267776, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4506250023841858, "rewards/correct_reward_func/std": 0.1287914216518402, "step": 367 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2000.0, "completions/max_terminated_length": 2000.0, "completions/mean_length": 1385.511962890625, "completions/mean_terminated_length": 1385.511962890625, "completions/min_length": 900.0, "completions/min_terminated_length": 900.0, "epoch": 0.573208722741433, "grad_norm": 0.5953041911125183, "kl": 0.04662996344268322, "learning_rate": 1.7768749999999998e-06, "loss": -0.0291, "num_tokens": 47338246.0, "reward": 1.61995267868042, "reward_std": 0.071071557700634, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.6199524998664856, "rewards/correct_reward_func/std": 0.15339720249176025, "step": 368 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1815.0, "completions/max_terminated_length": 1815.0, "completions/mean_length": 1350.6905517578125, "completions/mean_terminated_length": 1350.6905517578125, "completions/min_length": 880.0, "completions/min_terminated_length": 880.0, "epoch": 0.5747663551401869, "grad_norm": 0.596235990524292, "kl": 0.047611601650714874, "learning_rate": 1.77625e-06, "loss": 0.0142, "num_tokens": 47457746.0, "reward": 1.4904042482376099, "reward_std": 0.0895150825381279, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5023089647293091, "rewards/correct_reward_func/std": 0.10198992490768433, "step": 369 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2089.0, "completions/max_terminated_length": 2089.0, "completions/mean_length": 1335.142822265625, "completions/mean_terminated_length": 1335.142822265625, "completions/min_length": 707.0, "completions/min_terminated_length": 707.0, "epoch": 0.5763239875389408, "grad_norm": 0.6384495496749878, "kl": 0.04769720509648323, "learning_rate": 1.7756249999999998e-06, "loss": 0.0225, "num_tokens": 47575748.0, "reward": 1.4142802953720093, "reward_std": 0.08743462711572647, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4261849820613861, "rewards/correct_reward_func/std": 0.12853728234767914, "step": 370 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1798.0, "completions/max_terminated_length": 1798.0, "completions/mean_length": 1369.0595703125, "completions/mean_terminated_length": 1369.0595703125, "completions/min_length": 893.0, "completions/min_terminated_length": 893.0, "epoch": 0.5778816199376947, "grad_norm": 0.5966677069664001, "kl": 0.04758315160870552, "learning_rate": 1.7749999999999997e-06, "loss": 0.0182, "num_tokens": 47696935.0, "reward": 1.4828351736068726, "reward_std": 0.06344291567802429, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.482835054397583, "rewards/correct_reward_func/std": 0.13246676325798035, "step": 371 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1849.0, "completions/max_terminated_length": 1849.0, "completions/mean_length": 1291.5714111328125, "completions/mean_terminated_length": 1291.5714111328125, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.5794392523364486, "grad_norm": 0.6525982618331909, "kl": 0.049696190282702446, "learning_rate": 1.774375e-06, "loss": -0.0193, "num_tokens": 47811241.0, "reward": 1.4084582328796387, "reward_std": 0.06577665358781815, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.40845808386802673, "rewards/correct_reward_func/std": 0.11660967767238617, "step": 372 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2206.0, "completions/mean_length": 1518.5595703125, "completions/mean_terminated_length": 1438.1566162109375, "completions/min_length": 864.0, "completions/min_terminated_length": 864.0, "epoch": 0.5809968847352025, "grad_norm": 0.5672593116760254, "kl": 0.04501592554152012, "learning_rate": 1.77375e-06, "loss": 0.0895, "num_tokens": 47944788.0, "reward": 1.5155577659606934, "reward_std": 0.06096799299120903, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5155577659606934, "rewards/correct_reward_func/std": 0.16944406926631927, "step": 373 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2131.0, "completions/max_terminated_length": 2131.0, "completions/mean_length": 1468.6309814453125, "completions/mean_terminated_length": 1468.6309814453125, "completions/min_length": 833.0, "completions/min_terminated_length": 833.0, "epoch": 0.5825545171339563, "grad_norm": 0.5926371216773987, "kl": 0.04884720593690872, "learning_rate": 1.773125e-06, "loss": 0.0291, "num_tokens": 48074309.0, "reward": 1.5413291454315186, "reward_std": 0.074510857462883, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5413291454315186, "rewards/correct_reward_func/std": 0.15059438347816467, "step": 374 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1955.0, "completions/max_terminated_length": 1955.0, "completions/mean_length": 1333.75, "completions/mean_terminated_length": 1333.75, "completions/min_length": 892.0, "completions/min_terminated_length": 892.0, "epoch": 0.5841121495327103, "grad_norm": 0.6363287568092346, "kl": 0.04845425486564636, "learning_rate": 1.7725e-06, "loss": -0.0013, "num_tokens": 48192344.0, "reward": 1.4435120820999146, "reward_std": 0.09547659754753113, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.455416738986969, "rewards/correct_reward_func/std": 0.17547385394573212, "step": 375 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1829.0, "completions/max_terminated_length": 1829.0, "completions/mean_length": 1348.09521484375, "completions/mean_terminated_length": 1348.09521484375, "completions/min_length": 905.0, "completions/min_terminated_length": 905.0, "epoch": 0.5856697819314641, "grad_norm": 0.6347678303718567, "kl": 0.04872422479093075, "learning_rate": 1.771875e-06, "loss": -0.0063, "num_tokens": 48311446.0, "reward": 1.532442569732666, "reward_std": 0.08060499280691147, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5324423909187317, "rewards/correct_reward_func/std": 0.1389501988887787, "step": 376 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2040.0, "completions/max_terminated_length": 2040.0, "completions/mean_length": 1335.6905517578125, "completions/mean_terminated_length": 1335.6905517578125, "completions/min_length": 39.0, "completions/min_terminated_length": 39.0, "epoch": 0.5872274143302181, "grad_norm": 0.6710498929023743, "kl": 0.04991703853011131, "learning_rate": 1.77125e-06, "loss": -0.0097, "num_tokens": 48429530.0, "reward": 1.4852207899093628, "reward_std": 0.11480290442705154, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.5090302228927612, "rewards/correct_reward_func/std": 0.18953540921211243, "step": 377 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2115.0, "completions/max_terminated_length": 2115.0, "completions/mean_length": 1367.1190185546875, "completions/mean_terminated_length": 1367.1190185546875, "completions/min_length": 798.0, "completions/min_terminated_length": 798.0, "epoch": 0.5887850467289719, "grad_norm": 0.5975840091705322, "kl": 0.04870462976396084, "learning_rate": 1.7706249999999999e-06, "loss": 0.0098, "num_tokens": 48550482.0, "reward": 1.4639222621917725, "reward_std": 0.09818486869335175, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.47582679986953735, "rewards/correct_reward_func/std": 0.15385620296001434, "step": 378 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2593.0, "completions/max_terminated_length": 2593.0, "completions/mean_length": 1391.6190185546875, "completions/mean_terminated_length": 1391.6190185546875, "completions/min_length": 927.0, "completions/min_terminated_length": 927.0, "epoch": 0.5903426791277259, "grad_norm": 0.5850672721862793, "kl": 0.04981931112706661, "learning_rate": 1.77e-06, "loss": -0.0048, "num_tokens": 48673492.0, "reward": 1.4705395698547363, "reward_std": 0.07982930541038513, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4705394506454468, "rewards/correct_reward_func/std": 0.1553632766008377, "step": 379 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1856.0, "completions/max_terminated_length": 1856.0, "completions/mean_length": 1282.952392578125, "completions/mean_terminated_length": 1282.952392578125, "completions/min_length": 857.0, "completions/min_terminated_length": 857.0, "epoch": 0.5919003115264797, "grad_norm": 0.6228066682815552, "kl": 0.048284122720360756, "learning_rate": 1.769375e-06, "loss": -0.0227, "num_tokens": 48786996.0, "reward": 1.436045527458191, "reward_std": 0.05097164586186409, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4360455572605133, "rewards/correct_reward_func/std": 0.12165073305368423, "step": 380 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2251.0, "completions/max_terminated_length": 2251.0, "completions/mean_length": 1336.7738037109375, "completions/mean_terminated_length": 1336.7738037109375, "completions/min_length": 766.0, "completions/min_terminated_length": 766.0, "epoch": 0.5934579439252337, "grad_norm": 0.6182482838630676, "kl": 0.047869689762592316, "learning_rate": 1.76875e-06, "loss": -0.0094, "num_tokens": 48905447.0, "reward": 1.4301729202270508, "reward_std": 0.08307760953903198, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.44207748770713806, "rewards/correct_reward_func/std": 0.16730858385562897, "step": 381 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2161.0, "completions/max_terminated_length": 2161.0, "completions/mean_length": 1325.9881591796875, "completions/mean_terminated_length": 1325.9881591796875, "completions/min_length": 816.0, "completions/min_terminated_length": 816.0, "epoch": 0.5950155763239875, "grad_norm": 0.6429280638694763, "kl": 0.0489403922110796, "learning_rate": 1.768125e-06, "loss": -0.0063, "num_tokens": 49022764.0, "reward": 1.4742978811264038, "reward_std": 0.04766622185707092, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.47429779171943665, "rewards/correct_reward_func/std": 0.11626514792442322, "step": 382 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2004.0, "completions/max_terminated_length": 2004.0, "completions/mean_length": 1350.09521484375, "completions/mean_terminated_length": 1350.09521484375, "completions/min_length": 694.0, "completions/min_terminated_length": 694.0, "epoch": 0.5965732087227414, "grad_norm": 0.6260592341423035, "kl": 0.05001649633049965, "learning_rate": 1.7675e-06, "loss": 0.0078, "num_tokens": 49142112.0, "reward": 1.5085965394973755, "reward_std": 0.06493545323610306, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5085963606834412, "rewards/correct_reward_func/std": 0.12024178355932236, "step": 383 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2039.0, "completions/max_terminated_length": 2039.0, "completions/mean_length": 1421.166748046875, "completions/mean_terminated_length": 1421.166748046875, "completions/min_length": 987.0, "completions/min_terminated_length": 987.0, "epoch": 0.5981308411214953, "grad_norm": 0.577016294002533, "kl": 0.05048423446714878, "learning_rate": 1.766875e-06, "loss": 0.0171, "num_tokens": 49267316.0, "reward": 1.4990577697753906, "reward_std": 0.10350355505943298, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5109624266624451, "rewards/correct_reward_func/std": 0.17975008487701416, "step": 384 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2037.0, "completions/max_terminated_length": 2037.0, "completions/mean_length": 1388.6190185546875, "completions/mean_terminated_length": 1388.6190185546875, "completions/min_length": 862.0, "completions/min_terminated_length": 862.0, "epoch": 0.5996884735202492, "grad_norm": 0.6296937465667725, "kl": 0.048955587670207024, "learning_rate": 1.76625e-06, "loss": 0.0216, "num_tokens": 49390212.0, "reward": 1.4979599714279175, "reward_std": 0.09178230166435242, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5098646283149719, "rewards/correct_reward_func/std": 0.16648715734481812, "step": 385 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1842.0, "completions/max_terminated_length": 1842.0, "completions/mean_length": 1338.8214111328125, "completions/mean_terminated_length": 1338.8214111328125, "completions/min_length": 844.0, "completions/min_terminated_length": 844.0, "epoch": 0.6012461059190031, "grad_norm": 0.5855390429496765, "kl": 0.04758539795875549, "learning_rate": 1.765625e-06, "loss": -0.0172, "num_tokens": 49508787.0, "reward": 1.4601309299468994, "reward_std": 0.04890606552362442, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.46013087034225464, "rewards/correct_reward_func/std": 0.1410851925611496, "step": 386 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 1989.0, "completions/mean_length": 1426.46435546875, "completions/mean_terminated_length": 1344.9517822265625, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "epoch": 0.602803738317757, "grad_norm": 0.5801010131835938, "kl": 0.04862123541533947, "learning_rate": 1.7649999999999998e-06, "loss": 0.033, "num_tokens": 49634694.0, "reward": 1.5094380378723145, "reward_std": 0.07422788441181183, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5094379186630249, "rewards/correct_reward_func/std": 0.18413080275058746, "step": 387 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1957.0, "completions/max_terminated_length": 1957.0, "completions/mean_length": 1389.4761962890625, "completions/mean_terminated_length": 1389.4761962890625, "completions/min_length": 806.0, "completions/min_terminated_length": 806.0, "epoch": 0.6043613707165109, "grad_norm": 0.6577509641647339, "kl": 0.050652796402573586, "learning_rate": 1.764375e-06, "loss": 0.0029, "num_tokens": 49757398.0, "reward": 1.5519939661026, "reward_std": 0.06602007895708084, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5519937872886658, "rewards/correct_reward_func/std": 0.12210499495267868, "step": 388 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1984.0, "completions/max_terminated_length": 1984.0, "completions/mean_length": 1296.1785888671875, "completions/mean_terminated_length": 1296.1785888671875, "completions/min_length": 750.0, "completions/min_terminated_length": 750.0, "epoch": 0.6059190031152648, "grad_norm": 0.6384318470954895, "kl": 0.04982003942131996, "learning_rate": 1.7637499999999998e-06, "loss": -0.0317, "num_tokens": 49872283.0, "reward": 1.474208116531372, "reward_std": 0.056270867586135864, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4742080271244049, "rewards/correct_reward_func/std": 0.1628669947385788, "step": 389 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2054.0, "completions/max_terminated_length": 2054.0, "completions/mean_length": 1370.7738037109375, "completions/mean_terminated_length": 1370.7738037109375, "completions/min_length": 920.0, "completions/min_terminated_length": 920.0, "epoch": 0.6074766355140186, "grad_norm": 0.6070489287376404, "kl": 0.04989171586930752, "learning_rate": 1.763125e-06, "loss": 0.0072, "num_tokens": 49993458.0, "reward": 1.4242897033691406, "reward_std": 0.12058194726705551, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4361944794654846, "rewards/correct_reward_func/std": 0.15444742143154144, "step": 390 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1930.0, "completions/max_terminated_length": 1930.0, "completions/mean_length": 1269.6190185546875, "completions/mean_terminated_length": 1269.6190185546875, "completions/min_length": 758.0, "completions/min_terminated_length": 758.0, "epoch": 0.6090342679127726, "grad_norm": 0.6277110576629639, "kl": 0.05084827356040478, "learning_rate": 1.7624999999999999e-06, "loss": -0.0106, "num_tokens": 50105788.0, "reward": 1.4761323928833008, "reward_std": 0.08773455768823624, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.47613224387168884, "rewards/correct_reward_func/std": 0.18977254629135132, "step": 391 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2028.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 1340.547607421875, "completions/mean_terminated_length": 1340.547607421875, "completions/min_length": 660.0, "completions/min_terminated_length": 660.0, "epoch": 0.6105919003115264, "grad_norm": 0.6418157815933228, "kl": 0.05028718709945679, "learning_rate": 1.761875e-06, "loss": 0.0052, "num_tokens": 50224304.0, "reward": 1.5189129114151, "reward_std": 0.06859312951564789, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5189128518104553, "rewards/correct_reward_func/std": 0.13187937438488007, "step": 392 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1872.0, "completions/max_terminated_length": 1872.0, "completions/mean_length": 1348.2857666015625, "completions/mean_terminated_length": 1348.2857666015625, "completions/min_length": 875.0, "completions/min_terminated_length": 875.0, "epoch": 0.6121495327102804, "grad_norm": 0.6196921467781067, "kl": 0.05028197728097439, "learning_rate": 1.7612499999999999e-06, "loss": -0.0127, "num_tokens": 50343602.0, "reward": 1.4608813524246216, "reward_std": 0.0647798702120781, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4608812928199768, "rewards/correct_reward_func/std": 0.10814743489027023, "step": 393 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 1874.0, "completions/mean_length": 1432.761962890625, "completions/mean_terminated_length": 1351.3251953125, "completions/min_length": 841.0, "completions/min_terminated_length": 841.0, "epoch": 0.6137071651090342, "grad_norm": 0.5915994048118591, "kl": 0.04747145250439644, "learning_rate": 1.760625e-06, "loss": 0.0774, "num_tokens": 50470044.0, "reward": 1.4953646659851074, "reward_std": 0.06458833068609238, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.49536460638046265, "rewards/correct_reward_func/std": 0.15716253221035004, "step": 394 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2262.0, "completions/max_terminated_length": 2262.0, "completions/mean_length": 1341.4761962890625, "completions/mean_terminated_length": 1341.4761962890625, "completions/min_length": 772.0, "completions/min_terminated_length": 772.0, "epoch": 0.6152647975077882, "grad_norm": 0.6374837756156921, "kl": 0.05029851756989956, "learning_rate": 1.7599999999999999e-06, "loss": -0.0057, "num_tokens": 50588620.0, "reward": 1.4544249773025513, "reward_std": 0.06668942421674728, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4544249176979065, "rewards/correct_reward_func/std": 0.1269298493862152, "step": 395 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2396.0, "completions/max_terminated_length": 2396.0, "completions/mean_length": 1336.6785888671875, "completions/mean_terminated_length": 1336.6785888671875, "completions/min_length": 872.0, "completions/min_terminated_length": 872.0, "epoch": 0.616822429906542, "grad_norm": 0.6182777881622314, "kl": 0.04916258528828621, "learning_rate": 1.7593749999999998e-06, "loss": -0.0244, "num_tokens": 50706799.0, "reward": 1.5116595029830933, "reward_std": 0.056161068379879, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.511659562587738, "rewards/correct_reward_func/std": 0.17195159196853638, "step": 396 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2482.0, "completions/max_terminated_length": 2482.0, "completions/mean_length": 1375.8690185546875, "completions/mean_terminated_length": 1375.8690185546875, "completions/min_length": 937.0, "completions/min_terminated_length": 937.0, "epoch": 0.618380062305296, "grad_norm": 0.60200434923172, "kl": 0.049690814688801765, "learning_rate": 1.7587499999999999e-06, "loss": -0.0061, "num_tokens": 50828270.0, "reward": 1.492004156112671, "reward_std": 0.07065374404191971, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4920039772987366, "rewards/correct_reward_func/std": 0.14335261285305023, "step": 397 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2264.0, "completions/mean_length": 1516.9761962890625, "completions/mean_terminated_length": 1436.55419921875, "completions/min_length": 875.0, "completions/min_terminated_length": 875.0, "epoch": 0.6199376947040498, "grad_norm": 0.587050199508667, "kl": 0.048227181658148766, "learning_rate": 1.7581249999999998e-06, "loss": 0.0757, "num_tokens": 50961774.0, "reward": 1.5450599193572998, "reward_std": 0.09599590301513672, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.556964635848999, "rewards/correct_reward_func/std": 0.1692476123571396, "step": 398 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2190.0, "completions/max_terminated_length": 2190.0, "completions/mean_length": 1380.5357666015625, "completions/mean_terminated_length": 1380.5357666015625, "completions/min_length": 816.0, "completions/min_terminated_length": 816.0, "epoch": 0.6214953271028038, "grad_norm": 0.550334095954895, "kl": 0.04830704443156719, "learning_rate": 1.7575e-06, "loss": -0.0391, "num_tokens": 51083847.0, "reward": 1.4870414733886719, "reward_std": 0.06050838157534599, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.48704153299331665, "rewards/correct_reward_func/std": 0.13304997980594635, "step": 399 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2251.0, "completions/max_terminated_length": 2251.0, "completions/mean_length": 1446.96435546875, "completions/mean_terminated_length": 1446.96435546875, "completions/min_length": 839.0, "completions/min_terminated_length": 839.0, "epoch": 0.6230529595015576, "grad_norm": 0.6399713754653931, "kl": 0.04986717738211155, "learning_rate": 1.7568749999999998e-06, "loss": -0.0089, "num_tokens": 51211506.0, "reward": 1.505528211593628, "reward_std": 0.057832684367895126, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5055281519889832, "rewards/correct_reward_func/std": 0.18946535885334015, "step": 400 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2223.0, "completions/max_terminated_length": 2223.0, "completions/mean_length": 1390.4285888671875, "completions/mean_terminated_length": 1390.4285888671875, "completions/min_length": 932.0, "completions/min_terminated_length": 932.0, "epoch": 0.6246105919003115, "grad_norm": 0.6153919100761414, "kl": 0.048483846709132195, "learning_rate": 1.75625e-06, "loss": 0.0004, "num_tokens": 51334536.0, "reward": 1.5144027471542358, "reward_std": 0.046569447964429855, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5144026875495911, "rewards/correct_reward_func/std": 0.10261337459087372, "step": 401 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2250.0, "completions/mean_length": 1555.4761962890625, "completions/mean_terminated_length": 1475.51806640625, "completions/min_length": 901.0, "completions/min_terminated_length": 901.0, "epoch": 0.6261682242990654, "grad_norm": 0.5729800462722778, "kl": 0.04777614213526249, "learning_rate": 1.7556249999999998e-06, "loss": 0.0621, "num_tokens": 51471334.0, "reward": 1.477668285369873, "reward_std": 0.08024942129850388, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.47766822576522827, "rewards/correct_reward_func/std": 0.12405380606651306, "step": 402 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2004.0, "completions/max_terminated_length": 2004.0, "completions/mean_length": 1394.0714111328125, "completions/mean_terminated_length": 1394.0714111328125, "completions/min_length": 982.0, "completions/min_terminated_length": 982.0, "epoch": 0.6277258566978193, "grad_norm": 0.6006665229797363, "kl": 0.048750247806310654, "learning_rate": 1.7549999999999997e-06, "loss": 0.0141, "num_tokens": 51594508.0, "reward": 1.5268176794052124, "reward_std": 0.06284648180007935, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5268176794052124, "rewards/correct_reward_func/std": 0.1302812248468399, "step": 403 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2047.0, "completions/max_terminated_length": 2047.0, "completions/mean_length": 1379.416748046875, "completions/mean_terminated_length": 1379.416748046875, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.6292834890965732, "grad_norm": 0.6163754463195801, "kl": 0.0525053720921278, "learning_rate": 1.754375e-06, "loss": -0.0313, "num_tokens": 51716115.0, "reward": 1.5108131170272827, "reward_std": 0.09598790854215622, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5108129382133484, "rewards/correct_reward_func/std": 0.17914439737796783, "step": 404 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2625.0, "completions/max_terminated_length": 2625.0, "completions/mean_length": 1451.5595703125, "completions/mean_terminated_length": 1451.5595703125, "completions/min_length": 506.0, "completions/min_terminated_length": 506.0, "epoch": 0.6308411214953271, "grad_norm": 0.6257968544960022, "kl": 0.0501710157841444, "learning_rate": 1.75375e-06, "loss": -0.0006, "num_tokens": 51843950.0, "reward": 1.4702588319778442, "reward_std": 0.08658844977617264, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.48216357827186584, "rewards/correct_reward_func/std": 0.13416936993598938, "step": 405 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2347.0, "completions/max_terminated_length": 2347.0, "completions/mean_length": 1460.0, "completions/mean_terminated_length": 1460.0, "completions/min_length": 872.0, "completions/min_terminated_length": 872.0, "epoch": 0.632398753894081, "grad_norm": 0.5904620885848999, "kl": 0.049389807507395744, "learning_rate": 1.753125e-06, "loss": -0.0261, "num_tokens": 51972656.0, "reward": 1.5546208620071411, "reward_std": 0.06326793879270554, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5546208024024963, "rewards/correct_reward_func/std": 0.1749102622270584, "step": 406 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3438.0, "completions/max_terminated_length": 3438.0, "completions/mean_length": 1483.4761962890625, "completions/mean_terminated_length": 1483.4761962890625, "completions/min_length": 897.0, "completions/min_terminated_length": 897.0, "epoch": 0.6339563862928349, "grad_norm": 0.5588891506195068, "kl": 0.04781218431890011, "learning_rate": 1.7525e-06, "loss": 0.0121, "num_tokens": 52103292.0, "reward": 1.540654182434082, "reward_std": 0.06845831125974655, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5406539440155029, "rewards/correct_reward_func/std": 0.22292810678482056, "step": 407 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2493.0, "completions/max_terminated_length": 2493.0, "completions/mean_length": 1564.34521484375, "completions/mean_terminated_length": 1564.34521484375, "completions/min_length": 724.0, "completions/min_terminated_length": 724.0, "epoch": 0.6355140186915887, "grad_norm": 0.568066418170929, "kl": 0.050099100917577744, "learning_rate": 1.751875e-06, "loss": 0.0297, "num_tokens": 52240961.0, "reward": 1.460665225982666, "reward_std": 0.04765839874744415, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.46066510677337646, "rewards/correct_reward_func/std": 0.09499367326498032, "step": 408 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2255.0, "completions/max_terminated_length": 2255.0, "completions/mean_length": 1502.107177734375, "completions/mean_terminated_length": 1502.107177734375, "completions/min_length": 839.0, "completions/min_terminated_length": 839.0, "epoch": 0.6370716510903427, "grad_norm": 0.6036926507949829, "kl": 0.049950817599892616, "learning_rate": 1.75125e-06, "loss": -0.0168, "num_tokens": 52373144.0, "reward": 1.4840716123580933, "reward_std": 0.09531796723604202, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4959762990474701, "rewards/correct_reward_func/std": 0.1371496170759201, "step": 409 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.04761904761904767, "completions/max_length": 8192.0, "completions/max_terminated_length": 2686.0, "completions/mean_length": 1858.1905517578125, "completions/mean_terminated_length": 1541.5, "completions/min_length": 838.0, "completions/min_terminated_length": 838.0, "epoch": 0.6386292834890965, "grad_norm": 0.498012512922287, "kl": 0.04354623891413212, "learning_rate": 1.750625e-06, "loss": 0.2021, "num_tokens": 52535262.0, "reward": 1.4536290168762207, "reward_std": 0.09940145164728165, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4536289870738983, "rewards/correct_reward_func/std": 0.18348245322704315, "step": 410 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3503.0, "completions/max_terminated_length": 3503.0, "completions/mean_length": 1549.952392578125, "completions/mean_terminated_length": 1549.952392578125, "completions/min_length": 981.0, "completions/min_terminated_length": 981.0, "epoch": 0.6401869158878505, "grad_norm": 0.5648190379142761, "kl": 0.048112260177731514, "learning_rate": 1.75e-06, "loss": 0.004, "num_tokens": 52671566.0, "reward": 1.4433013200759888, "reward_std": 0.09281626343727112, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.45520591735839844, "rewards/correct_reward_func/std": 0.1479065865278244, "step": 411 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2165.0, "completions/max_terminated_length": 2165.0, "completions/mean_length": 1474.416748046875, "completions/mean_terminated_length": 1474.416748046875, "completions/min_length": 852.0, "completions/min_terminated_length": 852.0, "epoch": 0.6417445482866043, "grad_norm": 0.5981578826904297, "kl": 0.05129780061542988, "learning_rate": 1.7493749999999999e-06, "loss": 0.0015, "num_tokens": 52801327.0, "reward": 1.5297049283981323, "reward_std": 0.07844500243663788, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5297048687934875, "rewards/correct_reward_func/std": 0.17416192591190338, "step": 412 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2826.0, "completions/max_terminated_length": 2826.0, "completions/mean_length": 1552.8214111328125, "completions/mean_terminated_length": 1552.8214111328125, "completions/min_length": 831.0, "completions/min_terminated_length": 831.0, "epoch": 0.6433021806853583, "grad_norm": 0.551489531993866, "kl": 0.04923750273883343, "learning_rate": 1.74875e-06, "loss": -0.005, "num_tokens": 52937830.0, "reward": 1.544388771057129, "reward_std": 0.07821746915578842, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5562934875488281, "rewards/correct_reward_func/std": 0.155470609664917, "step": 413 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2580.0, "completions/max_terminated_length": 2580.0, "completions/mean_length": 1587.2857666015625, "completions/mean_terminated_length": 1587.2857666015625, "completions/min_length": 1037.0, "completions/min_terminated_length": 1037.0, "epoch": 0.6448598130841121, "grad_norm": 0.5425035953521729, "kl": 0.049748532474040985, "learning_rate": 1.7481249999999999e-06, "loss": -0.0384, "num_tokens": 53077198.0, "reward": 1.522072434425354, "reward_std": 0.10684633255004883, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5339770317077637, "rewards/correct_reward_func/std": 0.17637047171592712, "step": 414 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2465.0, "completions/mean_length": 1585.9405517578125, "completions/mean_terminated_length": 1506.349365234375, "completions/min_length": 975.0, "completions/min_terminated_length": 975.0, "epoch": 0.6464174454828661, "grad_norm": 0.5822159647941589, "kl": 0.04930712282657623, "learning_rate": 1.7475e-06, "loss": 0.0505, "num_tokens": 53216147.0, "reward": 1.520836591720581, "reward_std": 0.09006838500499725, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5208365321159363, "rewards/correct_reward_func/std": 0.19083261489868164, "step": 415 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2523.0, "completions/mean_length": 1632.4405517578125, "completions/mean_terminated_length": 1553.4095458984375, "completions/min_length": 890.0, "completions/min_terminated_length": 890.0, "epoch": 0.6479750778816199, "grad_norm": 0.5585830211639404, "kl": 0.04699916951358318, "learning_rate": 1.746875e-06, "loss": -0.0579, "num_tokens": 53359182.0, "reward": 1.4239436388015747, "reward_std": 0.05681487172842026, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.42394357919692993, "rewards/correct_reward_func/std": 0.12871341407299042, "step": 416 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3221.0, "completions/max_terminated_length": 3221.0, "completions/mean_length": 1636.5357666015625, "completions/mean_terminated_length": 1636.5357666015625, "completions/min_length": 877.0, "completions/min_terminated_length": 877.0, "epoch": 0.6495327102803738, "grad_norm": 0.6002727150917053, "kl": 0.05007455497980118, "learning_rate": 1.74625e-06, "loss": 0.0327, "num_tokens": 53502591.0, "reward": 1.4320467710494995, "reward_std": 0.05887793377041817, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4320466220378876, "rewards/correct_reward_func/std": 0.11791915446519852, "step": 417 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2371.0, "completions/mean_length": 1614.452392578125, "completions/mean_terminated_length": 1535.2047119140625, "completions/min_length": 796.0, "completions/min_terminated_length": 796.0, "epoch": 0.6510903426791277, "grad_norm": 0.601240873336792, "kl": 0.051675185561180115, "learning_rate": 1.745625e-06, "loss": 0.0749, "num_tokens": 53644247.0, "reward": 1.486029028892517, "reward_std": 0.07044512033462524, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.48602885007858276, "rewards/correct_reward_func/std": 0.1468798667192459, "step": 418 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3119.0, "completions/max_terminated_length": 3119.0, "completions/mean_length": 1591.75, "completions/mean_terminated_length": 1591.75, "completions/min_length": 1031.0, "completions/min_terminated_length": 1031.0, "epoch": 0.6526479750778816, "grad_norm": 0.6253805756568909, "kl": 0.051795635372400284, "learning_rate": 1.745e-06, "loss": -0.0031, "num_tokens": 53783720.0, "reward": 1.4498029947280884, "reward_std": 0.08419051766395569, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.46170762181282043, "rewards/correct_reward_func/std": 0.17074517905712128, "step": 419 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2441.0, "completions/max_terminated_length": 2441.0, "completions/mean_length": 1521.8214111328125, "completions/mean_terminated_length": 1521.8214111328125, "completions/min_length": 928.0, "completions/min_terminated_length": 928.0, "epoch": 0.6542056074766355, "grad_norm": 0.6050965785980225, "kl": 0.05233046971261501, "learning_rate": 1.744375e-06, "loss": -0.019, "num_tokens": 53917445.0, "reward": 1.435978651046753, "reward_std": 0.12028548866510391, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.459788054227829, "rewards/correct_reward_func/std": 0.14839471876621246, "step": 420 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2524.0, "completions/mean_length": 1687.952392578125, "completions/mean_terminated_length": 1609.59033203125, "completions/min_length": 913.0, "completions/min_terminated_length": 913.0, "epoch": 0.6557632398753894, "grad_norm": 0.542452871799469, "kl": 0.05065176263451576, "learning_rate": 1.7437499999999998e-06, "loss": 0.0578, "num_tokens": 54065257.0, "reward": 1.4500166177749634, "reward_std": 0.09093791991472244, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4500165581703186, "rewards/correct_reward_func/std": 0.16098248958587646, "step": 421 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2327.0, "completions/max_terminated_length": 2327.0, "completions/mean_length": 1550.5714111328125, "completions/mean_terminated_length": 1550.5714111328125, "completions/min_length": 969.0, "completions/min_terminated_length": 969.0, "epoch": 0.6573208722741433, "grad_norm": 0.5730518102645874, "kl": 0.050289461389184, "learning_rate": 1.743125e-06, "loss": -0.0254, "num_tokens": 54201613.0, "reward": 1.4041987657546997, "reward_std": 0.046286944299936295, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4041987359523773, "rewards/correct_reward_func/std": 0.13344568014144897, "step": 422 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2280.0, "completions/mean_length": 1631.297607421875, "completions/mean_terminated_length": 1552.2529296875, "completions/min_length": 804.0, "completions/min_terminated_length": 804.0, "epoch": 0.6588785046728972, "grad_norm": 0.5699371099472046, "kl": 0.05113396793603897, "learning_rate": 1.7424999999999998e-06, "loss": 0.0358, "num_tokens": 54344702.0, "reward": 1.4777201414108276, "reward_std": 0.07488631457090378, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.47772011160850525, "rewards/correct_reward_func/std": 0.13865311443805695, "step": 423 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2303.0, "completions/mean_length": 1677.59521484375, "completions/mean_terminated_length": 1599.1083984375, "completions/min_length": 831.0, "completions/min_terminated_length": 831.0, "epoch": 0.660436137071651, "grad_norm": 0.5397751927375793, "kl": 0.05114184692502022, "learning_rate": 1.741875e-06, "loss": -0.0164, "num_tokens": 54491662.0, "reward": 1.4992514848709106, "reward_std": 0.0635334923863411, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4992513060569763, "rewards/correct_reward_func/std": 0.1298021823167801, "step": 424 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2204.0, "completions/max_terminated_length": 2204.0, "completions/mean_length": 1552.1429443359375, "completions/mean_terminated_length": 1552.1429443359375, "completions/min_length": 1036.0, "completions/min_terminated_length": 1036.0, "epoch": 0.661993769470405, "grad_norm": 0.5741438269615173, "kl": 0.05266575887799263, "learning_rate": 1.7412499999999998e-06, "loss": 0.0127, "num_tokens": 54627778.0, "reward": 1.4440009593963623, "reward_std": 0.08349818736314774, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.45590564608573914, "rewards/correct_reward_func/std": 0.17922662198543549, "step": 425 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2499.0, "completions/max_terminated_length": 2499.0, "completions/mean_length": 1625.3929443359375, "completions/mean_terminated_length": 1625.3929443359375, "completions/min_length": 981.0, "completions/min_terminated_length": 981.0, "epoch": 0.6635514018691588, "grad_norm": 0.5885212421417236, "kl": 0.05119376443326473, "learning_rate": 1.740625e-06, "loss": 0.0118, "num_tokens": 54770425.0, "reward": 1.473638653755188, "reward_std": 0.0765259712934494, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4736386239528656, "rewards/correct_reward_func/std": 0.10725454986095428, "step": 426 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2610.0, "completions/mean_length": 1690.8333740234375, "completions/mean_terminated_length": 1612.5059814453125, "completions/min_length": 963.0, "completions/min_terminated_length": 963.0, "epoch": 0.6651090342679128, "grad_norm": 0.5927914381027222, "kl": 0.04985920339822769, "learning_rate": 1.7399999999999999e-06, "loss": 0.0531, "num_tokens": 54918389.0, "reward": 1.4642417430877686, "reward_std": 0.12046536058187485, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4761464297771454, "rewards/correct_reward_func/std": 0.1584354043006897, "step": 427 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2421.0, "completions/max_terminated_length": 2421.0, "completions/mean_length": 1637.1190185546875, "completions/mean_terminated_length": 1637.1190185546875, "completions/min_length": 1066.0, "completions/min_terminated_length": 1066.0, "epoch": 0.6666666666666666, "grad_norm": 0.5880814790725708, "kl": 0.05253339186310768, "learning_rate": 1.7393749999999998e-06, "loss": -0.0262, "num_tokens": 55061943.0, "reward": 1.5044912099838257, "reward_std": 0.08261405676603317, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5044911503791809, "rewards/correct_reward_func/std": 0.17851826548576355, "step": 428 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 3001.0, "completions/mean_length": 1644.8929443359375, "completions/mean_terminated_length": 1566.011962890625, "completions/min_length": 1051.0, "completions/min_terminated_length": 1051.0, "epoch": 0.6682242990654206, "grad_norm": 0.5974320769309998, "kl": 0.04985599033534527, "learning_rate": 1.7387499999999999e-06, "loss": 0.0317, "num_tokens": 55205916.0, "reward": 1.472242832183838, "reward_std": 0.13399535417556763, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.4960523247718811, "rewards/correct_reward_func/std": 0.16596059501171112, "step": 429 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2914.0, "completions/max_terminated_length": 2914.0, "completions/mean_length": 1612.9881591796875, "completions/mean_terminated_length": 1612.9881591796875, "completions/min_length": 673.0, "completions/min_terminated_length": 673.0, "epoch": 0.6697819314641744, "grad_norm": 0.5865140557289124, "kl": 0.05195549875497818, "learning_rate": 1.7381249999999998e-06, "loss": 0.0019, "num_tokens": 55347581.0, "reward": 1.4284037351608276, "reward_std": 0.09993235766887665, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.45221319794654846, "rewards/correct_reward_func/std": 0.11526290327310562, "step": 430 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2541.0, "completions/max_terminated_length": 2541.0, "completions/mean_length": 1535.5714111328125, "completions/mean_terminated_length": 1535.5714111328125, "completions/min_length": 992.0, "completions/min_terminated_length": 992.0, "epoch": 0.6713395638629284, "grad_norm": 0.5939237475395203, "kl": 0.05092081241309643, "learning_rate": 1.7374999999999999e-06, "loss": 0.0117, "num_tokens": 55482497.0, "reward": 1.5227851867675781, "reward_std": 0.09420502930879593, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5346897840499878, "rewards/correct_reward_func/std": 0.13495904207229614, "step": 431 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2354.0, "completions/max_terminated_length": 2354.0, "completions/mean_length": 1559.416748046875, "completions/mean_terminated_length": 1559.416748046875, "completions/min_length": 928.0, "completions/min_terminated_length": 928.0, "epoch": 0.6728971962616822, "grad_norm": 0.5865123271942139, "kl": 0.05216217786073685, "learning_rate": 1.7368749999999998e-06, "loss": 0.0131, "num_tokens": 55619320.0, "reward": 1.5050667524337769, "reward_std": 0.08095559477806091, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5169714689254761, "rewards/correct_reward_func/std": 0.18940287828445435, "step": 432 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2694.0, "completions/max_terminated_length": 2694.0, "completions/mean_length": 1583.1429443359375, "completions/mean_terminated_length": 1583.1429443359375, "completions/min_length": 918.0, "completions/min_terminated_length": 918.0, "epoch": 0.6744548286604362, "grad_norm": 0.6121124625205994, "kl": 0.04994286224246025, "learning_rate": 1.7362499999999999e-06, "loss": 0.0266, "num_tokens": 55758322.0, "reward": 1.4305202960968018, "reward_std": 0.0557343065738678, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.43052029609680176, "rewards/correct_reward_func/std": 0.15990039706230164, "step": 433 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 4451.0, "completions/mean_length": 1662.0357666015625, "completions/mean_terminated_length": 1583.361328125, "completions/min_length": 823.0, "completions/min_terminated_length": 823.0, "epoch": 0.67601246105919, "grad_norm": 0.5868165493011475, "kl": 0.04885072074830532, "learning_rate": 1.7356249999999998e-06, "loss": 0.0522, "num_tokens": 55903879.0, "reward": 1.4234789609909058, "reward_std": 0.1545991748571396, "rewards/contains_chinese/mean": 0.9523809552192688, "rewards/contains_chinese/std": 0.21423791348934174, "rewards/correct_reward_func/mean": 0.47109803557395935, "rewards/correct_reward_func/std": 0.1350628137588501, "step": 434 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2375.0, "completions/max_terminated_length": 2375.0, "completions/mean_length": 1513.5714111328125, "completions/mean_terminated_length": 1513.5714111328125, "completions/min_length": 948.0, "completions/min_terminated_length": 948.0, "epoch": 0.677570093457944, "grad_norm": 0.5805670619010925, "kl": 0.052099065855145454, "learning_rate": 1.7350000000000001e-06, "loss": -0.0069, "num_tokens": 56036809.0, "reward": 1.4437384605407715, "reward_std": 0.05860109254717827, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.44373825192451477, "rewards/correct_reward_func/std": 0.19147835671901703, "step": 435 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2622.0, "completions/mean_length": 1667.5238037109375, "completions/mean_terminated_length": 1588.9156494140625, "completions/min_length": 975.0, "completions/min_terminated_length": 975.0, "epoch": 0.6791277258566978, "grad_norm": 0.5827205181121826, "kl": 0.048958078026771545, "learning_rate": 1.734375e-06, "loss": 0.0677, "num_tokens": 56182893.0, "reward": 1.4653565883636475, "reward_std": 0.08377533406019211, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4653565585613251, "rewards/correct_reward_func/std": 0.17159578204154968, "step": 436 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2398.0, "completions/mean_length": 1661.6785888671875, "completions/mean_terminated_length": 1583.0, "completions/min_length": 852.0, "completions/min_terminated_length": 852.0, "epoch": 0.6806853582554517, "grad_norm": 0.5805427432060242, "kl": 0.04939436540007591, "learning_rate": 1.73375e-06, "loss": -0.0382, "num_tokens": 56328444.0, "reward": 1.443485140800476, "reward_std": 0.06904201209545135, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.44348499178886414, "rewards/correct_reward_func/std": 0.1347956359386444, "step": 437 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2602.0, "completions/mean_length": 1691.107177734375, "completions/mean_terminated_length": 1612.7830810546875, "completions/min_length": 980.0, "completions/min_terminated_length": 980.0, "epoch": 0.6822429906542056, "grad_norm": 0.5515686273574829, "kl": 0.05067290551960468, "learning_rate": 1.733125e-06, "loss": 0.0507, "num_tokens": 56476701.0, "reward": 1.4295130968093872, "reward_std": 0.11515135318040848, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.4533223509788513, "rewards/correct_reward_func/std": 0.16026785969734192, "step": 438 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2201.0, "completions/max_terminated_length": 2201.0, "completions/mean_length": 1521.6309814453125, "completions/mean_terminated_length": 1521.6309814453125, "completions/min_length": 896.0, "completions/min_terminated_length": 896.0, "epoch": 0.6838006230529595, "grad_norm": 0.5936988592147827, "kl": 0.052045663818717, "learning_rate": 1.7325e-06, "loss": 0.031, "num_tokens": 56610566.0, "reward": 1.4802929162979126, "reward_std": 0.08335726708173752, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.504102349281311, "rewards/correct_reward_func/std": 0.1541454792022705, "step": 439 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2006.0, "completions/max_terminated_length": 2006.0, "completions/mean_length": 1464.0595703125, "completions/mean_terminated_length": 1464.0595703125, "completions/min_length": 916.0, "completions/min_terminated_length": 916.0, "epoch": 0.6853582554517134, "grad_norm": 0.6046100854873657, "kl": 0.05066749081015587, "learning_rate": 1.731875e-06, "loss": -0.006, "num_tokens": 56739727.0, "reward": 1.4421093463897705, "reward_std": 0.05164036527276039, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4421093463897705, "rewards/correct_reward_func/std": 0.14187321066856384, "step": 440 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.023809523809523836, "completions/max_length": 8192.0, "completions/max_terminated_length": 2175.0, "completions/mean_length": 1621.3929443359375, "completions/mean_terminated_length": 1461.134033203125, "completions/min_length": 844.0, "completions/min_terminated_length": 844.0, "epoch": 0.6869158878504673, "grad_norm": 0.5737205147743225, "kl": 0.04900176823139191, "learning_rate": 1.73125e-06, "loss": 0.0934, "num_tokens": 56881816.0, "reward": 1.42433500289917, "reward_std": 0.0748237892985344, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.42433494329452515, "rewards/correct_reward_func/std": 0.18206322193145752, "step": 441 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2703.0, "completions/max_terminated_length": 2703.0, "completions/mean_length": 1491.047607421875, "completions/mean_terminated_length": 1491.047607421875, "completions/min_length": 932.0, "completions/min_terminated_length": 932.0, "epoch": 0.6884735202492211, "grad_norm": 0.6053724884986877, "kl": 0.052103569731116295, "learning_rate": 1.730625e-06, "loss": -0.0204, "num_tokens": 57012968.0, "reward": 1.5510308742523193, "reward_std": 0.09038885682821274, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5629354119300842, "rewards/correct_reward_func/std": 0.22346119582653046, "step": 442 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2108.0, "completions/max_terminated_length": 2108.0, "completions/mean_length": 1477.84521484375, "completions/mean_terminated_length": 1477.84521484375, "completions/min_length": 881.0, "completions/min_terminated_length": 881.0, "epoch": 0.6900311526479751, "grad_norm": 0.5881515741348267, "kl": 0.05128224939107895, "learning_rate": 1.73e-06, "loss": -0.0017, "num_tokens": 57143023.0, "reward": 1.4883114099502563, "reward_std": 0.062143657356500626, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.48831140995025635, "rewards/correct_reward_func/std": 0.13774645328521729, "step": 443 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2033.0, "completions/max_terminated_length": 2033.0, "completions/mean_length": 1422.7261962890625, "completions/mean_terminated_length": 1422.7261962890625, "completions/min_length": 851.0, "completions/min_terminated_length": 851.0, "epoch": 0.6915887850467289, "grad_norm": 0.6870121359825134, "kl": 0.0524381622672081, "learning_rate": 1.729375e-06, "loss": 0.0179, "num_tokens": 57268346.0, "reward": 1.4325767755508423, "reward_std": 0.11797544360160828, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4444815218448639, "rewards/correct_reward_func/std": 0.13778088986873627, "step": 444 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2291.0, "completions/max_terminated_length": 2291.0, "completions/mean_length": 1486.011962890625, "completions/mean_terminated_length": 1486.011962890625, "completions/min_length": 435.0, "completions/min_terminated_length": 435.0, "epoch": 0.6931464174454829, "grad_norm": 0.589530348777771, "kl": 0.04958914779126644, "learning_rate": 1.72875e-06, "loss": -0.0082, "num_tokens": 57399075.0, "reward": 1.4617598056793213, "reward_std": 0.0732613280415535, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4617597460746765, "rewards/correct_reward_func/std": 0.20478412508964539, "step": 445 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2343.0, "completions/max_terminated_length": 2343.0, "completions/mean_length": 1440.261962890625, "completions/mean_terminated_length": 1440.261962890625, "completions/min_length": 687.0, "completions/min_terminated_length": 687.0, "epoch": 0.6947040498442367, "grad_norm": 0.5789417624473572, "kl": 0.049345508217811584, "learning_rate": 1.7281249999999999e-06, "loss": -0.0048, "num_tokens": 57526129.0, "reward": 1.4764251708984375, "reward_std": 0.060231760144233704, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4764251708984375, "rewards/correct_reward_func/std": 0.15916836261749268, "step": 446 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2026.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 1481.5, "completions/mean_terminated_length": 1481.5, "completions/min_length": 756.0, "completions/min_terminated_length": 756.0, "epoch": 0.6962616822429907, "grad_norm": 0.5849701762199402, "kl": 0.052394647151231766, "learning_rate": 1.7275e-06, "loss": -0.0097, "num_tokens": 57656521.0, "reward": 1.5263036489486694, "reward_std": 0.07463201880455017, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5263035893440247, "rewards/correct_reward_func/std": 0.17446979880332947, "step": 447 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3242.0, "completions/max_terminated_length": 3242.0, "completions/mean_length": 1495.3095703125, "completions/mean_terminated_length": 1495.3095703125, "completions/min_length": 905.0, "completions/min_terminated_length": 905.0, "epoch": 0.6978193146417445, "grad_norm": 0.628301203250885, "kl": 0.05113241821527481, "learning_rate": 1.7268749999999999e-06, "loss": 0.0129, "num_tokens": 57788163.0, "reward": 1.4820013046264648, "reward_std": 0.09342510253190994, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.49390602111816406, "rewards/correct_reward_func/std": 0.12805330753326416, "step": 448 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2319.0, "completions/max_terminated_length": 2319.0, "completions/mean_length": 1449.011962890625, "completions/mean_terminated_length": 1449.011962890625, "completions/min_length": 719.0, "completions/min_terminated_length": 719.0, "epoch": 0.6993769470404985, "grad_norm": 0.626068651676178, "kl": 0.05249497666954994, "learning_rate": 1.72625e-06, "loss": 0.0168, "num_tokens": 57916012.0, "reward": 1.4972912073135376, "reward_std": 0.07907280325889587, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.509195864200592, "rewards/correct_reward_func/std": 0.15952186286449432, "step": 449 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1982.0, "completions/max_terminated_length": 1982.0, "completions/mean_length": 1396.7381591796875, "completions/mean_terminated_length": 1396.7381591796875, "completions/min_length": 795.0, "completions/min_terminated_length": 795.0, "epoch": 0.7009345794392523, "grad_norm": 0.5788159966468811, "kl": 0.0519126933068037, "learning_rate": 1.7256249999999999e-06, "loss": -0.0161, "num_tokens": 58039266.0, "reward": 1.5347965955734253, "reward_std": 0.05890589952468872, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5347966551780701, "rewards/correct_reward_func/std": 0.13743919134140015, "step": 450 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1962.0, "completions/max_terminated_length": 1962.0, "completions/mean_length": 1381.011962890625, "completions/mean_terminated_length": 1381.011962890625, "completions/min_length": 824.0, "completions/min_terminated_length": 824.0, "epoch": 0.7024922118380063, "grad_norm": 0.6056314706802368, "kl": 0.05103152059018612, "learning_rate": 1.725e-06, "loss": 0.0104, "num_tokens": 58161217.0, "reward": 1.479777455329895, "reward_std": 0.06452760100364685, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.47977739572525024, "rewards/correct_reward_func/std": 0.1589631587266922, "step": 451 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1997.0, "completions/max_terminated_length": 1997.0, "completions/mean_length": 1404.9285888671875, "completions/mean_terminated_length": 1404.9285888671875, "completions/min_length": 574.0, "completions/min_terminated_length": 574.0, "epoch": 0.7040498442367601, "grad_norm": 0.6056322455406189, "kl": 0.05121096037328243, "learning_rate": 1.724375e-06, "loss": 0.0433, "num_tokens": 58285201.0, "reward": 1.568735957145691, "reward_std": 0.0742240622639656, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5687359571456909, "rewards/correct_reward_func/std": 0.1560218781232834, "step": 452 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2313.0, "completions/mean_length": 1456.1429443359375, "completions/mean_terminated_length": 1374.9879150390625, "completions/min_length": 926.0, "completions/min_terminated_length": 926.0, "epoch": 0.705607476635514, "grad_norm": 0.6087128520011902, "kl": 0.05143558606505394, "learning_rate": 1.7237499999999998e-06, "loss": 0.0035, "num_tokens": 58413421.0, "reward": 1.4904062747955322, "reward_std": 0.09947887063026428, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5023109912872314, "rewards/correct_reward_func/std": 0.18832674622535706, "step": 453 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2173.0, "completions/max_terminated_length": 2173.0, "completions/mean_length": 1527.4285888671875, "completions/mean_terminated_length": 1527.4285888671875, "completions/min_length": 585.0, "completions/min_terminated_length": 585.0, "epoch": 0.7071651090342679, "grad_norm": 0.607707679271698, "kl": 0.054172057658433914, "learning_rate": 1.723125e-06, "loss": -0.0295, "num_tokens": 58547887.0, "reward": 1.4630801677703857, "reward_std": 0.07300285249948502, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4749848544597626, "rewards/correct_reward_func/std": 0.16017581522464752, "step": 454 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2094.0, "completions/max_terminated_length": 2094.0, "completions/mean_length": 1457.2857666015625, "completions/mean_terminated_length": 1457.2857666015625, "completions/min_length": 841.0, "completions/min_terminated_length": 841.0, "epoch": 0.7087227414330218, "grad_norm": 0.6382337212562561, "kl": 0.05202684551477432, "learning_rate": 1.7224999999999998e-06, "loss": -0.0015, "num_tokens": 58676275.0, "reward": 1.4991950988769531, "reward_std": 0.10090982168912888, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5110997557640076, "rewards/correct_reward_func/std": 0.14260563254356384, "step": 455 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.023809523809523836, "completions/max_length": 8192.0, "completions/max_terminated_length": 2076.0, "completions/mean_length": 1584.9285888671875, "completions/mean_terminated_length": 1423.7803955078125, "completions/min_length": 477.0, "completions/min_terminated_length": 477.0, "epoch": 0.7102803738317757, "grad_norm": 0.567070722579956, "kl": 0.08697609417140484, "learning_rate": 1.721875e-06, "loss": 0.104, "num_tokens": 58815337.0, "reward": 1.4610228538513184, "reward_std": 0.08824088424444199, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.46102282404899597, "rewards/correct_reward_func/std": 0.17577558755874634, "step": 456 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2597.0, "completions/mean_length": 1508.1190185546875, "completions/mean_terminated_length": 1427.59033203125, "completions/min_length": 826.0, "completions/min_terminated_length": 826.0, "epoch": 0.7118380062305296, "grad_norm": 0.579940140247345, "kl": 0.04937991686165333, "learning_rate": 1.7212499999999998e-06, "loss": 0.0414, "num_tokens": 58947815.0, "reward": 1.44125497341156, "reward_std": 0.09176965802907944, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4531596899032593, "rewards/correct_reward_func/std": 0.11111555248498917, "step": 457 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2106.0, "completions/max_terminated_length": 2106.0, "completions/mean_length": 1457.7261962890625, "completions/mean_terminated_length": 1457.7261962890625, "completions/min_length": 845.0, "completions/min_terminated_length": 845.0, "epoch": 0.7133956386292835, "grad_norm": 0.5890275239944458, "kl": 0.04897093586623669, "learning_rate": 1.720625e-06, "loss": -0.0176, "num_tokens": 59076216.0, "reward": 1.4860827922821045, "reward_std": 0.08254723250865936, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4979875087738037, "rewards/correct_reward_func/std": 0.14583896100521088, "step": 458 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2329.0, "completions/max_terminated_length": 2329.0, "completions/mean_length": 1443.0, "completions/mean_terminated_length": 1443.0, "completions/min_length": 995.0, "completions/min_terminated_length": 995.0, "epoch": 0.7149532710280374, "grad_norm": 0.6299088001251221, "kl": 0.05053492821753025, "learning_rate": 1.7199999999999998e-06, "loss": 0.0295, "num_tokens": 59203470.0, "reward": 1.4992488622665405, "reward_std": 0.08619312942028046, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5111536979675293, "rewards/correct_reward_func/std": 0.1396382600069046, "step": 459 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2038.0, "completions/max_terminated_length": 2038.0, "completions/mean_length": 1419.607177734375, "completions/mean_terminated_length": 1419.607177734375, "completions/min_length": 982.0, "completions/min_terminated_length": 982.0, "epoch": 0.7165109034267912, "grad_norm": 0.6011829376220703, "kl": 0.051410723477602005, "learning_rate": 1.719375e-06, "loss": 0.0157, "num_tokens": 59328723.0, "reward": 1.5500842332839966, "reward_std": 0.06590086221694946, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5500842332839966, "rewards/correct_reward_func/std": 0.15555351972579956, "step": 460 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.023809523809523836, "completions/max_length": 8192.0, "completions/max_terminated_length": 2246.0, "completions/mean_length": 1639.3690185546875, "completions/mean_terminated_length": 1479.5487060546875, "completions/min_length": 1004.0, "completions/min_terminated_length": 1004.0, "epoch": 0.7180685358255452, "grad_norm": 0.5221880078315735, "kl": 0.04706592485308647, "learning_rate": 1.7187499999999998e-06, "loss": 0.1247, "num_tokens": 59472472.0, "reward": 1.4629874229431152, "reward_std": 0.11201505362987518, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.47489219903945923, "rewards/correct_reward_func/std": 0.16951104998588562, "step": 461 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2356.0, "completions/max_terminated_length": 2356.0, "completions/mean_length": 1470.6785888671875, "completions/mean_terminated_length": 1470.6785888671875, "completions/min_length": 944.0, "completions/min_terminated_length": 944.0, "epoch": 0.719626168224299, "grad_norm": 0.605837345123291, "kl": 0.04970187321305275, "learning_rate": 1.7181249999999997e-06, "loss": 0.0014, "num_tokens": 59602219.0, "reward": 1.5099036693572998, "reward_std": 0.06017957627773285, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5099035501480103, "rewards/correct_reward_func/std": 0.1773725152015686, "step": 462 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2662.0, "completions/max_terminated_length": 2662.0, "completions/mean_length": 1585.297607421875, "completions/mean_terminated_length": 1585.297607421875, "completions/min_length": 985.0, "completions/min_terminated_length": 985.0, "epoch": 0.721183800623053, "grad_norm": 0.5602222681045532, "kl": 0.049900198355317116, "learning_rate": 1.7174999999999999e-06, "loss": -0.0306, "num_tokens": 59741534.0, "reward": 1.505386233329773, "reward_std": 0.07570932060480118, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5053861141204834, "rewards/correct_reward_func/std": 0.11848840862512589, "step": 463 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2467.0, "completions/max_terminated_length": 2467.0, "completions/mean_length": 1453.7381591796875, "completions/mean_terminated_length": 1453.7381591796875, "completions/min_length": 835.0, "completions/min_terminated_length": 835.0, "epoch": 0.7227414330218068, "grad_norm": 0.5771290063858032, "kl": 0.05098097398877144, "learning_rate": 1.7168749999999998e-06, "loss": -0.0083, "num_tokens": 59869402.0, "reward": 1.41793692111969, "reward_std": 0.07035666704177856, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.41793686151504517, "rewards/correct_reward_func/std": 0.12359312176704407, "step": 464 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2362.0, "completions/max_terminated_length": 2362.0, "completions/mean_length": 1601.15478515625, "completions/mean_terminated_length": 1601.15478515625, "completions/min_length": 967.0, "completions/min_terminated_length": 967.0, "epoch": 0.7242990654205608, "grad_norm": 0.5878456830978394, "kl": 0.051242388784885406, "learning_rate": 1.7162499999999999e-06, "loss": -0.0256, "num_tokens": 60010073.0, "reward": 1.5170272588729858, "reward_std": 0.08878003805875778, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5289318561553955, "rewards/correct_reward_func/std": 0.1320001482963562, "step": 465 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3673.0, "completions/max_terminated_length": 3673.0, "completions/mean_length": 1584.96435546875, "completions/mean_terminated_length": 1584.96435546875, "completions/min_length": 880.0, "completions/min_terminated_length": 880.0, "epoch": 0.7258566978193146, "grad_norm": 0.56437748670578, "kl": 0.05264845862984657, "learning_rate": 1.7156249999999998e-06, "loss": -0.0124, "num_tokens": 60149144.0, "reward": 1.449500560760498, "reward_std": 0.06773830950260162, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.44950050115585327, "rewards/correct_reward_func/std": 0.15649531781673431, "step": 466 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2261.0, "completions/max_terminated_length": 2261.0, "completions/mean_length": 1570.761962890625, "completions/mean_terminated_length": 1570.761962890625, "completions/min_length": 989.0, "completions/min_terminated_length": 989.0, "epoch": 0.7274143302180686, "grad_norm": 0.586487352848053, "kl": 0.05096551589667797, "learning_rate": 1.715e-06, "loss": -0.002, "num_tokens": 60287136.0, "reward": 1.5255881547927856, "reward_std": 0.08541964739561081, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5374928712844849, "rewards/correct_reward_func/std": 0.18721869587898254, "step": 467 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2230.0, "completions/max_terminated_length": 2230.0, "completions/mean_length": 1424.4405517578125, "completions/mean_terminated_length": 1424.4405517578125, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "epoch": 0.7289719626168224, "grad_norm": 0.5780627727508545, "kl": 0.05155480466783047, "learning_rate": 1.714375e-06, "loss": -0.0576, "num_tokens": 60412483.0, "reward": 1.4178261756896973, "reward_std": 0.0780172049999237, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4178261458873749, "rewards/correct_reward_func/std": 0.15439294278621674, "step": 468 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2758.0, "completions/max_terminated_length": 2758.0, "completions/mean_length": 1656.5357666015625, "completions/mean_terminated_length": 1656.5357666015625, "completions/min_length": 863.0, "completions/min_terminated_length": 863.0, "epoch": 0.7305295950155763, "grad_norm": 0.602411687374115, "kl": 0.05079780891537666, "learning_rate": 1.7137500000000001e-06, "loss": -0.0068, "num_tokens": 60557530.0, "reward": 1.455553412437439, "reward_std": 0.05027348920702934, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.45555347204208374, "rewards/correct_reward_func/std": 0.08646845817565918, "step": 469 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2639.0, "completions/mean_length": 1774.96435546875, "completions/mean_terminated_length": 1697.6505126953125, "completions/min_length": 1001.0, "completions/min_terminated_length": 1001.0, "epoch": 0.7320872274143302, "grad_norm": 0.5296007394790649, "kl": 0.04803318716585636, "learning_rate": 1.713125e-06, "loss": 0.0229, "num_tokens": 60712831.0, "reward": 1.4119234085083008, "reward_std": 0.0688636526465416, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.411923348903656, "rewards/correct_reward_func/std": 0.15681524574756622, "step": 470 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2303.0, "completions/max_terminated_length": 2303.0, "completions/mean_length": 1522.75, "completions/mean_terminated_length": 1522.75, "completions/min_length": 774.0, "completions/min_terminated_length": 774.0, "epoch": 0.7336448598130841, "grad_norm": 0.5558991432189941, "kl": 0.051254723221063614, "learning_rate": 1.7125e-06, "loss": 0.0205, "num_tokens": 60846634.0, "reward": 1.4586231708526611, "reward_std": 0.08559418469667435, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4705279767513275, "rewards/correct_reward_func/std": 0.13128410279750824, "step": 471 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2990.0, "completions/max_terminated_length": 2990.0, "completions/mean_length": 1663.6785888671875, "completions/mean_terminated_length": 1663.6785888671875, "completions/min_length": 872.0, "completions/min_terminated_length": 872.0, "epoch": 0.735202492211838, "grad_norm": 0.5688772797584534, "kl": 0.0507583636790514, "learning_rate": 1.711875e-06, "loss": 0.0262, "num_tokens": 60992275.0, "reward": 1.5023914575576782, "reward_std": 0.1096097081899643, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5142960548400879, "rewards/correct_reward_func/std": 0.14677509665489197, "step": 472 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2984.0, "completions/max_terminated_length": 2984.0, "completions/mean_length": 1556.7738037109375, "completions/mean_terminated_length": 1556.7738037109375, "completions/min_length": 798.0, "completions/min_terminated_length": 798.0, "epoch": 0.7367601246105919, "grad_norm": 0.6016846895217896, "kl": 0.05065236613154411, "learning_rate": 1.71125e-06, "loss": 0.0569, "num_tokens": 61128960.0, "reward": 1.4518747329711914, "reward_std": 0.06996078789234161, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.46377936005592346, "rewards/correct_reward_func/std": 0.1257256120443344, "step": 473 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2511.0, "completions/max_terminated_length": 2511.0, "completions/mean_length": 1696.1429443359375, "completions/mean_terminated_length": 1696.1429443359375, "completions/min_length": 1009.0, "completions/min_terminated_length": 1009.0, "epoch": 0.7383177570093458, "grad_norm": 0.5974622964859009, "kl": 0.05056299455463886, "learning_rate": 1.710625e-06, "loss": -0.0015, "num_tokens": 61277664.0, "reward": 1.4762911796569824, "reward_std": 0.04775853455066681, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.47629112005233765, "rewards/correct_reward_func/std": 0.1305771917104721, "step": 474 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2737.0, "completions/max_terminated_length": 2737.0, "completions/mean_length": 1597.011962890625, "completions/mean_terminated_length": 1597.011962890625, "completions/min_length": 935.0, "completions/min_terminated_length": 935.0, "epoch": 0.7398753894080997, "grad_norm": 0.5798767805099487, "kl": 0.05241680145263672, "learning_rate": 1.71e-06, "loss": 0.001, "num_tokens": 61417813.0, "reward": 1.4604065418243408, "reward_std": 0.07921247184276581, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4604065418243408, "rewards/correct_reward_func/std": 0.12199635803699493, "step": 475 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2322.0, "completions/max_terminated_length": 2322.0, "completions/mean_length": 1607.8929443359375, "completions/mean_terminated_length": 1607.8929443359375, "completions/min_length": 960.0, "completions/min_terminated_length": 960.0, "epoch": 0.7414330218068536, "grad_norm": 0.5608850717544556, "kl": 0.05262966826558113, "learning_rate": 1.709375e-06, "loss": 0.0272, "num_tokens": 61558840.0, "reward": 1.429826021194458, "reward_std": 0.07498659938573837, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4417307674884796, "rewards/correct_reward_func/std": 0.16307246685028076, "step": 476 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2451.0, "completions/max_terminated_length": 2451.0, "completions/mean_length": 1651.3929443359375, "completions/mean_terminated_length": 1651.3929443359375, "completions/min_length": 886.0, "completions/min_terminated_length": 886.0, "epoch": 0.7429906542056075, "grad_norm": 0.5357276201248169, "kl": 0.05072159692645073, "learning_rate": 1.70875e-06, "loss": 0.0321, "num_tokens": 61703707.0, "reward": 1.4157724380493164, "reward_std": 0.10736193507909775, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.43958187103271484, "rewards/correct_reward_func/std": 0.12011130154132843, "step": 477 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2437.0, "completions/mean_length": 1622.2261962890625, "completions/mean_terminated_length": 1543.072265625, "completions/min_length": 736.0, "completions/min_terminated_length": 736.0, "epoch": 0.7445482866043613, "grad_norm": 0.5624713897705078, "kl": 0.050551433116197586, "learning_rate": 1.7081249999999998e-06, "loss": 0.0817, "num_tokens": 61845956.0, "reward": 1.4873374700546265, "reward_std": 0.1054694801568985, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4992421567440033, "rewards/correct_reward_func/std": 0.1856101006269455, "step": 478 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7608.0, "completions/max_terminated_length": 7608.0, "completions/mean_length": 1654.2738037109375, "completions/mean_terminated_length": 1654.2738037109375, "completions/min_length": 1067.0, "completions/min_terminated_length": 1067.0, "epoch": 0.7461059190031153, "grad_norm": 0.5814685821533203, "kl": 0.05036089010536671, "learning_rate": 1.7075e-06, "loss": 0.0464, "num_tokens": 61991041.0, "reward": 1.451461911201477, "reward_std": 0.09037837386131287, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4633665978908539, "rewards/correct_reward_func/std": 0.12058395892381668, "step": 479 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2120.0, "completions/max_terminated_length": 2120.0, "completions/mean_length": 1494.3214111328125, "completions/mean_terminated_length": 1494.3214111328125, "completions/min_length": 1001.0, "completions/min_terminated_length": 1001.0, "epoch": 0.7476635514018691, "grad_norm": 0.5886862277984619, "kl": 0.050980525091290474, "learning_rate": 1.7068749999999999e-06, "loss": 0.0151, "num_tokens": 62122528.0, "reward": 1.461233139038086, "reward_std": 0.07253991812467575, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.47313785552978516, "rewards/correct_reward_func/std": 0.14707376062870026, "step": 480 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2365.0, "completions/max_terminated_length": 2365.0, "completions/mean_length": 1512.6309814453125, "completions/mean_terminated_length": 1512.6309814453125, "completions/min_length": 875.0, "completions/min_terminated_length": 875.0, "epoch": 0.7492211838006231, "grad_norm": 0.5945307016372681, "kl": 0.0515163391828537, "learning_rate": 1.70625e-06, "loss": 0.0025, "num_tokens": 62255565.0, "reward": 1.5655839443206787, "reward_std": 0.07332275062799454, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5655838251113892, "rewards/correct_reward_func/std": 0.18415912985801697, "step": 481 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2281.0, "completions/max_terminated_length": 2281.0, "completions/mean_length": 1462.0357666015625, "completions/mean_terminated_length": 1462.0357666015625, "completions/min_length": 871.0, "completions/min_terminated_length": 871.0, "epoch": 0.7507788161993769, "grad_norm": 0.575648307800293, "kl": 0.052306439727544785, "learning_rate": 1.7056249999999999e-06, "loss": -0.0139, "num_tokens": 62384370.0, "reward": 1.5397838354110718, "reward_std": 0.0683126300573349, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.539783775806427, "rewards/correct_reward_func/std": 0.17842523753643036, "step": 482 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2599.0, "completions/max_terminated_length": 2599.0, "completions/mean_length": 1485.84521484375, "completions/mean_terminated_length": 1485.84521484375, "completions/min_length": 810.0, "completions/min_terminated_length": 810.0, "epoch": 0.7523364485981309, "grad_norm": 0.5642300248146057, "kl": 0.05181148275732994, "learning_rate": 1.705e-06, "loss": 0.0319, "num_tokens": 62515067.0, "reward": 1.4557358026504517, "reward_std": 0.11825248599052429, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.4795452356338501, "rewards/correct_reward_func/std": 0.16276678442955017, "step": 483 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2106.0, "completions/max_terminated_length": 2106.0, "completions/mean_length": 1477.65478515625, "completions/mean_terminated_length": 1477.65478515625, "completions/min_length": 900.0, "completions/min_terminated_length": 900.0, "epoch": 0.7538940809968847, "grad_norm": 0.5979113578796387, "kl": 0.05194063484668732, "learning_rate": 1.7043749999999999e-06, "loss": 0.0126, "num_tokens": 62645214.0, "reward": 1.4856928586959839, "reward_std": 0.0904906839132309, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4975975453853607, "rewards/correct_reward_func/std": 0.13653963804244995, "step": 484 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2521.0, "completions/max_terminated_length": 2521.0, "completions/mean_length": 1486.6309814453125, "completions/mean_terminated_length": 1486.6309814453125, "completions/min_length": 902.0, "completions/min_terminated_length": 902.0, "epoch": 0.7554517133956387, "grad_norm": 0.6199777722358704, "kl": 0.05140496790409088, "learning_rate": 1.70375e-06, "loss": 0.0157, "num_tokens": 62776049.0, "reward": 1.491317629814148, "reward_std": 0.06931986659765244, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4913175702095032, "rewards/correct_reward_func/std": 0.14081616699695587, "step": 485 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1881.0, "completions/max_terminated_length": 1881.0, "completions/mean_length": 1389.3095703125, "completions/mean_terminated_length": 1389.3095703125, "completions/min_length": 750.0, "completions/min_terminated_length": 750.0, "epoch": 0.7570093457943925, "grad_norm": 0.5803106427192688, "kl": 0.05373929440975189, "learning_rate": 1.7031249999999999e-06, "loss": -0.0063, "num_tokens": 62898547.0, "reward": 1.4282513856887817, "reward_std": 0.06992341578006744, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4282512664794922, "rewards/correct_reward_func/std": 0.14699019491672516, "step": 486 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2418.0, "completions/max_terminated_length": 2418.0, "completions/mean_length": 1418.5357666015625, "completions/mean_terminated_length": 1418.5357666015625, "completions/min_length": 809.0, "completions/min_terminated_length": 809.0, "epoch": 0.7585669781931464, "grad_norm": 0.620182454586029, "kl": 0.0546103548258543, "learning_rate": 1.7024999999999998e-06, "loss": -0.0024, "num_tokens": 63023800.0, "reward": 1.4738986492156982, "reward_std": 0.11807496100664139, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4858033359050751, "rewards/correct_reward_func/std": 0.16928018629550934, "step": 487 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2352.0, "completions/max_terminated_length": 2352.0, "completions/mean_length": 1467.21435546875, "completions/mean_terminated_length": 1467.21435546875, "completions/min_length": 749.0, "completions/min_terminated_length": 749.0, "epoch": 0.7601246105919003, "grad_norm": 0.5910825729370117, "kl": 0.0529879629611969, "learning_rate": 1.701875e-06, "loss": 0.0043, "num_tokens": 63153034.0, "reward": 1.4873260259628296, "reward_std": 0.05371030792593956, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4873259365558624, "rewards/correct_reward_func/std": 0.11501560360193253, "step": 488 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2009.0, "completions/max_terminated_length": 2009.0, "completions/mean_length": 1416.6429443359375, "completions/mean_terminated_length": 1416.6429443359375, "completions/min_length": 823.0, "completions/min_terminated_length": 823.0, "epoch": 0.7616822429906542, "grad_norm": 0.617638885974884, "kl": 0.05328808352351189, "learning_rate": 1.7012499999999998e-06, "loss": 0.0089, "num_tokens": 63278134.0, "reward": 1.4918674230575562, "reward_std": 0.05592425912618637, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.49186742305755615, "rewards/correct_reward_func/std": 0.11877977102994919, "step": 489 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2390.0, "completions/max_terminated_length": 2390.0, "completions/mean_length": 1438.6309814453125, "completions/mean_terminated_length": 1438.6309814453125, "completions/min_length": 671.0, "completions/min_terminated_length": 671.0, "epoch": 0.7632398753894081, "grad_norm": 0.5881965160369873, "kl": 0.0523222591727972, "learning_rate": 1.700625e-06, "loss": -0.0096, "num_tokens": 63405033.0, "reward": 1.4045344591140747, "reward_std": 0.07134377211332321, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.41643914580345154, "rewards/correct_reward_func/std": 0.1567194163799286, "step": 490 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2166.0, "completions/max_terminated_length": 2166.0, "completions/mean_length": 1401.0714111328125, "completions/mean_terminated_length": 1401.0714111328125, "completions/min_length": 877.0, "completions/min_terminated_length": 877.0, "epoch": 0.764797507788162, "grad_norm": 0.6001453995704651, "kl": 0.05563800781965256, "learning_rate": 1.6999999999999998e-06, "loss": -0.0112, "num_tokens": 63528669.0, "reward": 1.4765801429748535, "reward_std": 0.0643647164106369, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.47658008337020874, "rewards/correct_reward_func/std": 0.16985422372817993, "step": 491 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2280.0, "completions/mean_length": 1454.0357666015625, "completions/mean_terminated_length": 1372.8553466796875, "completions/min_length": 921.0, "completions/min_terminated_length": 921.0, "epoch": 0.7663551401869159, "grad_norm": 0.6344577670097351, "kl": 0.05234198831021786, "learning_rate": 1.699375e-06, "loss": 0.0894, "num_tokens": 63656826.0, "reward": 1.5414061546325684, "reward_std": 0.08393041044473648, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5533110499382019, "rewards/correct_reward_func/std": 0.12277739495038986, "step": 492 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2018.0, "completions/max_terminated_length": 2018.0, "completions/mean_length": 1342.84521484375, "completions/mean_terminated_length": 1342.84521484375, "completions/min_length": 847.0, "completions/min_terminated_length": 847.0, "epoch": 0.7679127725856698, "grad_norm": 0.6125787496566772, "kl": 0.05459017679095268, "learning_rate": 1.6987499999999998e-06, "loss": -0.0266, "num_tokens": 63775661.0, "reward": 1.463148832321167, "reward_std": 0.0493975505232811, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.46314874291419983, "rewards/correct_reward_func/std": 0.12720361351966858, "step": 493 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2163.0, "completions/max_terminated_length": 2163.0, "completions/mean_length": 1410.2261962890625, "completions/mean_terminated_length": 1410.2261962890625, "completions/min_length": 799.0, "completions/min_terminated_length": 799.0, "epoch": 0.7694704049844237, "grad_norm": 0.625629186630249, "kl": 0.05353173241019249, "learning_rate": 1.698125e-06, "loss": 0.0163, "num_tokens": 63900228.0, "reward": 1.5133934020996094, "reward_std": 0.10190200060606003, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5252981781959534, "rewards/correct_reward_func/std": 0.13644695281982422, "step": 494 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2018.0, "completions/max_terminated_length": 2018.0, "completions/mean_length": 1312.297607421875, "completions/mean_terminated_length": 1312.297607421875, "completions/min_length": 691.0, "completions/min_terminated_length": 691.0, "epoch": 0.7710280373831776, "grad_norm": 0.6696212887763977, "kl": 0.05529572255909443, "learning_rate": 1.6974999999999998e-06, "loss": -0.0171, "num_tokens": 64016269.0, "reward": 1.464273452758789, "reward_std": 0.07633471488952637, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4642733931541443, "rewards/correct_reward_func/std": 0.1434505134820938, "step": 495 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2636.0, "completions/max_terminated_length": 2636.0, "completions/mean_length": 1418.7738037109375, "completions/mean_terminated_length": 1418.7738037109375, "completions/min_length": 822.0, "completions/min_terminated_length": 822.0, "epoch": 0.7725856697819314, "grad_norm": 0.6065709590911865, "kl": 0.05418024770915508, "learning_rate": 1.6968749999999997e-06, "loss": -0.03, "num_tokens": 64141350.0, "reward": 1.5015569925308228, "reward_std": 0.0749620795249939, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5015567541122437, "rewards/correct_reward_func/std": 0.1574825942516327, "step": 496 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2235.0, "completions/max_terminated_length": 2235.0, "completions/mean_length": 1359.4761962890625, "completions/mean_terminated_length": 1359.4761962890625, "completions/min_length": 744.0, "completions/min_terminated_length": 744.0, "epoch": 0.7741433021806854, "grad_norm": 0.6393604278564453, "kl": 0.053761230781674385, "learning_rate": 1.6962499999999999e-06, "loss": -0.0042, "num_tokens": 64261606.0, "reward": 1.5147299766540527, "reward_std": 0.08370784670114517, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5266348719596863, "rewards/correct_reward_func/std": 0.12140747904777527, "step": 497 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2030.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 1397.642822265625, "completions/mean_terminated_length": 1397.642822265625, "completions/min_length": 803.0, "completions/min_terminated_length": 803.0, "epoch": 0.7757009345794392, "grad_norm": 0.5982325673103333, "kl": 0.05507444404065609, "learning_rate": 1.695625e-06, "loss": 0.0236, "num_tokens": 64385140.0, "reward": 1.4505058526992798, "reward_std": 0.06045551598072052, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4505058526992798, "rewards/correct_reward_func/std": 0.16863283514976501, "step": 498 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2320.0, "completions/max_terminated_length": 2320.0, "completions/mean_length": 1413.607177734375, "completions/mean_terminated_length": 1413.607177734375, "completions/min_length": 897.0, "completions/min_terminated_length": 897.0, "epoch": 0.7772585669781932, "grad_norm": 0.6396889686584473, "kl": 0.0517488569021225, "learning_rate": 1.695e-06, "loss": 0.0036, "num_tokens": 64510087.0, "reward": 1.5013822317123413, "reward_std": 0.05566215515136719, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5013821721076965, "rewards/correct_reward_func/std": 0.1984904706478119, "step": 499 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1967.0, "completions/max_terminated_length": 1967.0, "completions/mean_length": 1331.75, "completions/mean_terminated_length": 1331.75, "completions/min_length": 665.0, "completions/min_terminated_length": 665.0, "epoch": 0.778816199376947, "grad_norm": 0.6182188391685486, "kl": 0.05334976129233837, "learning_rate": 1.694375e-06, "loss": 0.0177, "num_tokens": 64627834.0, "reward": 1.4577831029891968, "reward_std": 0.07913817465305328, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.46968796849250793, "rewards/correct_reward_func/std": 0.1922857016324997, "step": 500 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2392.0, "completions/max_terminated_length": 2392.0, "completions/mean_length": 1450.797607421875, "completions/mean_terminated_length": 1450.797607421875, "completions/min_length": 787.0, "completions/min_terminated_length": 787.0, "epoch": 0.780373831775701, "grad_norm": 0.6251698136329651, "kl": 0.051690295338630676, "learning_rate": 1.69375e-06, "loss": 0.0195, "num_tokens": 64755809.0, "reward": 1.4815540313720703, "reward_std": 0.06203337013721466, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.48155394196510315, "rewards/correct_reward_func/std": 0.14881980419158936, "step": 501 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2222.0, "completions/max_terminated_length": 2222.0, "completions/mean_length": 1361.0, "completions/mean_terminated_length": 1361.0, "completions/min_length": 455.0, "completions/min_terminated_length": 455.0, "epoch": 0.7819314641744548, "grad_norm": 0.6154322624206543, "kl": 0.05483602173626423, "learning_rate": 1.693125e-06, "loss": -0.0267, "num_tokens": 64876139.0, "reward": 1.434553861618042, "reward_std": 0.1056680679321289, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.44645848870277405, "rewards/correct_reward_func/std": 0.15556204319000244, "step": 502 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2066.0, "completions/max_terminated_length": 2066.0, "completions/mean_length": 1336.75, "completions/mean_terminated_length": 1336.75, "completions/min_length": 699.0, "completions/min_terminated_length": 699.0, "epoch": 0.7834890965732088, "grad_norm": 0.6085004210472107, "kl": 0.05451551079750061, "learning_rate": 1.6924999999999999e-06, "loss": 0.027, "num_tokens": 64994438.0, "reward": 1.5156744718551636, "reward_std": 0.043764952570199966, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5156744718551636, "rewards/correct_reward_func/std": 0.10536623001098633, "step": 503 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2154.0, "completions/max_terminated_length": 2154.0, "completions/mean_length": 1349.65478515625, "completions/mean_terminated_length": 1349.65478515625, "completions/min_length": 831.0, "completions/min_terminated_length": 831.0, "epoch": 0.7850467289719626, "grad_norm": 0.6269933581352234, "kl": 0.05425166338682175, "learning_rate": 1.691875e-06, "loss": 0.005, "num_tokens": 65113491.0, "reward": 1.4136245250701904, "reward_std": 0.07355698943138123, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.41362443566322327, "rewards/correct_reward_func/std": 0.11681222915649414, "step": 504 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6143.0, "completions/max_terminated_length": 6143.0, "completions/mean_length": 1480.0357666015625, "completions/mean_terminated_length": 1480.0357666015625, "completions/min_length": 867.0, "completions/min_terminated_length": 867.0, "epoch": 0.7866043613707165, "grad_norm": 0.58085036277771, "kl": 0.05101562291383743, "learning_rate": 1.69125e-06, "loss": 0.0136, "num_tokens": 65243952.0, "reward": 1.4428439140319824, "reward_std": 0.10061752051115036, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.45474860072135925, "rewards/correct_reward_func/std": 0.15178316831588745, "step": 505 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2496.0, "completions/mean_length": 1478.357177734375, "completions/mean_terminated_length": 1397.4698486328125, "completions/min_length": 876.0, "completions/min_terminated_length": 876.0, "epoch": 0.7881619937694704, "grad_norm": 0.614782989025116, "kl": 0.05181491747498512, "learning_rate": 1.690625e-06, "loss": 0.0927, "num_tokens": 65374230.0, "reward": 1.5134276151657104, "reward_std": 0.10299229621887207, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5253323912620544, "rewards/correct_reward_func/std": 0.13395552337169647, "step": 506 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1895.0, "completions/max_terminated_length": 1895.0, "completions/mean_length": 1314.7381591796875, "completions/mean_terminated_length": 1314.7381591796875, "completions/min_length": 715.0, "completions/min_terminated_length": 715.0, "epoch": 0.7897196261682243, "grad_norm": 0.6423413753509521, "kl": 0.052083175629377365, "learning_rate": 1.69e-06, "loss": -0.0164, "num_tokens": 65490740.0, "reward": 1.4805521965026855, "reward_std": 0.07352635264396667, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4924568831920624, "rewards/correct_reward_func/std": 0.14243850111961365, "step": 507 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2152.0, "completions/mean_length": 1539.2738037109375, "completions/mean_terminated_length": 1459.1204833984375, "completions/min_length": 953.0, "completions/min_terminated_length": 953.0, "epoch": 0.7912772585669782, "grad_norm": 0.6012693643569946, "kl": 0.052487269043922424, "learning_rate": 1.689375e-06, "loss": 0.0457, "num_tokens": 65626057.0, "reward": 1.493817687034607, "reward_std": 0.06720510125160217, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.49381768703460693, "rewards/correct_reward_func/std": 0.141060933470726, "step": 508 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2385.0, "completions/mean_length": 1402.7261962890625, "completions/mean_terminated_length": 1320.9276123046875, "completions/min_length": 851.0, "completions/min_terminated_length": 851.0, "epoch": 0.7928348909657321, "grad_norm": 0.610801100730896, "kl": 0.053645048290491104, "learning_rate": 1.68875e-06, "loss": 0.0464, "num_tokens": 65749730.0, "reward": 1.4257547855377197, "reward_std": 0.12775346636772156, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.44956421852111816, "rewards/correct_reward_func/std": 0.128119558095932, "step": 509 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2260.0, "completions/max_terminated_length": 2260.0, "completions/mean_length": 1397.3095703125, "completions/mean_terminated_length": 1397.3095703125, "completions/min_length": 715.0, "completions/min_terminated_length": 715.0, "epoch": 0.794392523364486, "grad_norm": 0.6136677265167236, "kl": 0.05390959791839123, "learning_rate": 1.688125e-06, "loss": 0.01, "num_tokens": 65873218.0, "reward": 1.447425127029419, "reward_std": 0.07631354033946991, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4593297839164734, "rewards/correct_reward_func/std": 0.13860400021076202, "step": 510 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1830.0, "completions/max_terminated_length": 1830.0, "completions/mean_length": 1334.452392578125, "completions/mean_terminated_length": 1334.452392578125, "completions/min_length": 805.0, "completions/min_terminated_length": 805.0, "epoch": 0.7959501557632399, "grad_norm": 0.6296293139457703, "kl": 0.05470990762114525, "learning_rate": 1.6875e-06, "loss": 0.018, "num_tokens": 65991102.0, "reward": 1.4921993017196655, "reward_std": 0.09120924770832062, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5041038990020752, "rewards/correct_reward_func/std": 0.20131434500217438, "step": 511 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2360.0, "completions/max_terminated_length": 2360.0, "completions/mean_length": 1491.107177734375, "completions/mean_terminated_length": 1491.107177734375, "completions/min_length": 717.0, "completions/min_terminated_length": 717.0, "epoch": 0.7975077881619937, "grad_norm": 0.6053344011306763, "kl": 0.054690854623913765, "learning_rate": 1.6868749999999998e-06, "loss": 0.0082, "num_tokens": 66122301.0, "reward": 1.5253815650939941, "reward_std": 0.0556306354701519, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5253814458847046, "rewards/correct_reward_func/std": 0.19547365605831146, "step": 512 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2043.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 1394.857177734375, "completions/mean_terminated_length": 1394.857177734375, "completions/min_length": 842.0, "completions/min_terminated_length": 842.0, "epoch": 0.7990654205607477, "grad_norm": 0.6043696403503418, "kl": 0.05226844176650047, "learning_rate": 1.68625e-06, "loss": 0.0163, "num_tokens": 66245343.0, "reward": 1.5012750625610352, "reward_std": 0.07726840674877167, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5012750625610352, "rewards/correct_reward_func/std": 0.17448803782463074, "step": 513 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2104.0, "completions/max_terminated_length": 2104.0, "completions/mean_length": 1384.1785888671875, "completions/mean_terminated_length": 1384.1785888671875, "completions/min_length": 786.0, "completions/min_terminated_length": 786.0, "epoch": 0.8006230529595015, "grad_norm": 0.5974183678627014, "kl": 0.055330896750092506, "learning_rate": 1.6856249999999998e-06, "loss": -0.0029, "num_tokens": 66367602.0, "reward": 1.39756441116333, "reward_std": 0.10132217407226562, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.42137381434440613, "rewards/correct_reward_func/std": 0.16763533651828766, "step": 514 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2356.0, "completions/max_terminated_length": 2356.0, "completions/mean_length": 1507.5833740234375, "completions/mean_terminated_length": 1507.5833740234375, "completions/min_length": 941.0, "completions/min_terminated_length": 941.0, "epoch": 0.8021806853582555, "grad_norm": 0.5618709325790405, "kl": 0.05384498089551926, "learning_rate": 1.685e-06, "loss": 0.0189, "num_tokens": 66500467.0, "reward": 1.504148006439209, "reward_std": 0.057246141135692596, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5041479468345642, "rewards/correct_reward_func/std": 0.1694769561290741, "step": 515 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3970.0, "completions/max_terminated_length": 3970.0, "completions/mean_length": 1491.3095703125, "completions/mean_terminated_length": 1491.3095703125, "completions/min_length": 859.0, "completions/min_terminated_length": 859.0, "epoch": 0.8037383177570093, "grad_norm": 0.6067160964012146, "kl": 0.05185644514858723, "learning_rate": 1.6843749999999999e-06, "loss": 0.0155, "num_tokens": 66631719.0, "reward": 1.4592036008834839, "reward_std": 0.0799928829073906, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4711082875728607, "rewards/correct_reward_func/std": 0.12382801622152328, "step": 516 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2280.0, "completions/max_terminated_length": 2280.0, "completions/mean_length": 1441.6905517578125, "completions/mean_terminated_length": 1441.6905517578125, "completions/min_length": 814.0, "completions/min_terminated_length": 814.0, "epoch": 0.8052959501557633, "grad_norm": 0.5906907916069031, "kl": 0.05282064713537693, "learning_rate": 1.68375e-06, "loss": 0.0161, "num_tokens": 66758611.0, "reward": 1.4480403661727905, "reward_std": 0.07139705866575241, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4599449932575226, "rewards/correct_reward_func/std": 0.18760444223880768, "step": 517 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2303.0, "completions/max_terminated_length": 2303.0, "completions/mean_length": 1492.5357666015625, "completions/mean_terminated_length": 1492.5357666015625, "completions/min_length": 753.0, "completions/min_terminated_length": 753.0, "epoch": 0.8068535825545171, "grad_norm": 0.5900249481201172, "kl": 0.05465748719871044, "learning_rate": 1.6831249999999999e-06, "loss": 0.0236, "num_tokens": 66889984.0, "reward": 1.473251223564148, "reward_std": 0.05754239857196808, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4732511639595032, "rewards/correct_reward_func/std": 0.16302239894866943, "step": 518 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2189.0, "completions/max_terminated_length": 2189.0, "completions/mean_length": 1433.2381591796875, "completions/mean_terminated_length": 1433.2381591796875, "completions/min_length": 794.0, "completions/min_terminated_length": 794.0, "epoch": 0.8084112149532711, "grad_norm": 0.6327292323112488, "kl": 0.05179595574736595, "learning_rate": 1.6825e-06, "loss": -0.0127, "num_tokens": 67016334.0, "reward": 1.517919898033142, "reward_std": 0.10886523127555847, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.5417292714118958, "rewards/correct_reward_func/std": 0.18844658136367798, "step": 519 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2372.0, "completions/max_terminated_length": 2372.0, "completions/mean_length": 1455.261962890625, "completions/mean_terminated_length": 1455.261962890625, "completions/min_length": 935.0, "completions/min_terminated_length": 935.0, "epoch": 0.8099688473520249, "grad_norm": 0.5824243426322937, "kl": 0.05402742512524128, "learning_rate": 1.6818749999999999e-06, "loss": 0.0003, "num_tokens": 67144684.0, "reward": 1.4728202819824219, "reward_std": 0.07268865406513214, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.48472505807876587, "rewards/correct_reward_func/std": 0.1330195963382721, "step": 520 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1930.0, "completions/max_terminated_length": 1930.0, "completions/mean_length": 1356.7738037109375, "completions/mean_terminated_length": 1356.7738037109375, "completions/min_length": 699.0, "completions/min_terminated_length": 699.0, "epoch": 0.8115264797507789, "grad_norm": 0.635526716709137, "kl": 0.052402498200535774, "learning_rate": 1.6812499999999998e-06, "loss": -0.0018, "num_tokens": 67264455.0, "reward": 1.4374048709869385, "reward_std": 0.10275428742170334, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.44930967688560486, "rewards/correct_reward_func/std": 0.12510575354099274, "step": 521 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2142.0, "completions/max_terminated_length": 2142.0, "completions/mean_length": 1410.011962890625, "completions/mean_terminated_length": 1410.011962890625, "completions/min_length": 785.0, "completions/min_terminated_length": 785.0, "epoch": 0.8130841121495327, "grad_norm": 0.6527436971664429, "kl": 0.061864860355854034, "learning_rate": 1.680625e-06, "loss": -0.007, "num_tokens": 67388722.0, "reward": 1.4815458059310913, "reward_std": 0.06229028478264809, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.481545627117157, "rewards/correct_reward_func/std": 0.18914058804512024, "step": 522 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2120.0, "completions/max_terminated_length": 2120.0, "completions/mean_length": 1409.8809814453125, "completions/mean_terminated_length": 1409.8809814453125, "completions/min_length": 44.0, "completions/min_terminated_length": 44.0, "epoch": 0.8146417445482866, "grad_norm": 0.5648652911186218, "kl": 0.05158809758722782, "learning_rate": 1.6799999999999998e-06, "loss": -0.0055, "num_tokens": 67513224.0, "reward": 1.4905917644500732, "reward_std": 0.0897277295589447, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5024964213371277, "rewards/correct_reward_func/std": 0.1823735535144806, "step": 523 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2069.0, "completions/max_terminated_length": 2069.0, "completions/mean_length": 1410.6190185546875, "completions/mean_terminated_length": 1410.6190185546875, "completions/min_length": 615.0, "completions/min_terminated_length": 615.0, "epoch": 0.8161993769470405, "grad_norm": 0.601411759853363, "kl": 0.0548630990087986, "learning_rate": 1.679375e-06, "loss": 0.0227, "num_tokens": 67637728.0, "reward": 1.4363572597503662, "reward_std": 0.08451084047555923, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.44826188683509827, "rewards/correct_reward_func/std": 0.17532704770565033, "step": 524 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2041.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 1413.547607421875, "completions/mean_terminated_length": 1413.547607421875, "completions/min_length": 863.0, "completions/min_terminated_length": 863.0, "epoch": 0.8177570093457944, "grad_norm": 0.619880735874176, "kl": 0.0535897146910429, "learning_rate": 1.6787499999999998e-06, "loss": 0.0125, "num_tokens": 67762508.0, "reward": 1.4487403631210327, "reward_std": 0.0894000232219696, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.46064507961273193, "rewards/correct_reward_func/std": 0.14411649107933044, "step": 525 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2225.0, "completions/max_terminated_length": 2225.0, "completions/mean_length": 1396.892822265625, "completions/mean_terminated_length": 1396.892822265625, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.8193146417445483, "grad_norm": 0.5552582144737244, "kl": 0.052618470042943954, "learning_rate": 1.678125e-06, "loss": 0.0085, "num_tokens": 67885985.0, "reward": 1.513651967048645, "reward_std": 0.07783416658639908, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5255565643310547, "rewards/correct_reward_func/std": 0.18098370730876923, "step": 526 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2145.0, "completions/max_terminated_length": 2145.0, "completions/mean_length": 1407.4285888671875, "completions/mean_terminated_length": 1407.4285888671875, "completions/min_length": 803.0, "completions/min_terminated_length": 803.0, "epoch": 0.8208722741433022, "grad_norm": 0.5891075730323792, "kl": 0.05191943235695362, "learning_rate": 1.6774999999999998e-06, "loss": -0.0255, "num_tokens": 68010101.0, "reward": 1.531822919845581, "reward_std": 0.0646144449710846, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5318229794502258, "rewards/correct_reward_func/std": 0.13647998869419098, "step": 527 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2089.0, "completions/max_terminated_length": 2089.0, "completions/mean_length": 1330.71435546875, "completions/mean_terminated_length": 1330.71435546875, "completions/min_length": 894.0, "completions/min_terminated_length": 894.0, "epoch": 0.822429906542056, "grad_norm": 0.6207326054573059, "kl": 0.05244195647537708, "learning_rate": 1.6768749999999997e-06, "loss": -0.0223, "num_tokens": 68127731.0, "reward": 1.5092251300811768, "reward_std": 0.05685145780444145, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5092251300811768, "rewards/correct_reward_func/std": 0.1656326800584793, "step": 528 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2121.0, "completions/max_terminated_length": 2121.0, "completions/mean_length": 1391.7381591796875, "completions/mean_terminated_length": 1391.7381591796875, "completions/min_length": 651.0, "completions/min_terminated_length": 651.0, "epoch": 0.82398753894081, "grad_norm": 0.5846036076545715, "kl": 0.05157465487718582, "learning_rate": 1.67625e-06, "loss": 0.0272, "num_tokens": 68250595.0, "reward": 1.480483889579773, "reward_std": 0.06157321855425835, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4804837703704834, "rewards/correct_reward_func/std": 0.12125560641288757, "step": 529 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 1558.21435546875, "completions/mean_terminated_length": 1478.2890625, "completions/min_length": 859.0, "completions/min_terminated_length": 859.0, "epoch": 0.8255451713395638, "grad_norm": 0.5714659094810486, "kl": 0.04993342235684395, "learning_rate": 1.675625e-06, "loss": 0.0461, "num_tokens": 68387617.0, "reward": 1.4617350101470947, "reward_std": 0.062348198145627975, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4617350101470947, "rewards/correct_reward_func/std": 0.1364985555410385, "step": 530 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2211.0, "completions/max_terminated_length": 2211.0, "completions/mean_length": 1433.452392578125, "completions/mean_terminated_length": 1433.452392578125, "completions/min_length": 881.0, "completions/min_terminated_length": 881.0, "epoch": 0.8271028037383178, "grad_norm": 0.5750571489334106, "kl": 0.05183848738670349, "learning_rate": 1.675e-06, "loss": -0.012, "num_tokens": 68513937.0, "reward": 1.5087705850601196, "reward_std": 0.07611233741044998, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5087705254554749, "rewards/correct_reward_func/std": 0.17911018431186676, "step": 531 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2028.0, "completions/max_terminated_length": 2028.0, "completions/mean_length": 1391.5357666015625, "completions/mean_terminated_length": 1391.5357666015625, "completions/min_length": 839.0, "completions/min_terminated_length": 839.0, "epoch": 0.8286604361370716, "grad_norm": 0.6533128023147583, "kl": 0.05436134710907936, "learning_rate": 1.674375e-06, "loss": 0.0264, "num_tokens": 68636850.0, "reward": 1.51654851436615, "reward_std": 0.09789982438087463, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5284532904624939, "rewards/correct_reward_func/std": 0.15198057889938354, "step": 532 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2412.0, "completions/max_terminated_length": 2412.0, "completions/mean_length": 1433.857177734375, "completions/mean_terminated_length": 1433.857177734375, "completions/min_length": 828.0, "completions/min_terminated_length": 828.0, "epoch": 0.8302180685358256, "grad_norm": 0.5955455303192139, "kl": 0.055306799709796906, "learning_rate": 1.67375e-06, "loss": -0.0119, "num_tokens": 68763192.0, "reward": 1.379233717918396, "reward_std": 0.04899342358112335, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.37923356890678406, "rewards/correct_reward_func/std": 0.1353388875722885, "step": 533 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2564.0, "completions/max_terminated_length": 2564.0, "completions/mean_length": 1472.8095703125, "completions/mean_terminated_length": 1472.8095703125, "completions/min_length": 511.0, "completions/min_terminated_length": 511.0, "epoch": 0.8317757009345794, "grad_norm": 0.5779551267623901, "kl": 0.05130494572222233, "learning_rate": 1.673125e-06, "loss": 0.0082, "num_tokens": 68893040.0, "reward": 1.4051659107208252, "reward_std": 0.049869608134031296, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4051658511161804, "rewards/correct_reward_func/std": 0.14183004200458527, "step": 534 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2026.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 1413.2857666015625, "completions/mean_terminated_length": 1413.2857666015625, "completions/min_length": 855.0, "completions/min_terminated_length": 855.0, "epoch": 0.8333333333333334, "grad_norm": 0.6209075450897217, "kl": 0.05292078107595444, "learning_rate": 1.6725e-06, "loss": -0.0059, "num_tokens": 69017690.0, "reward": 1.5440659523010254, "reward_std": 0.06486238539218903, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5440659523010254, "rewards/correct_reward_func/std": 0.1593974381685257, "step": 535 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2176.0, "completions/max_terminated_length": 2176.0, "completions/mean_length": 1482.3809814453125, "completions/mean_terminated_length": 1482.3809814453125, "completions/min_length": 914.0, "completions/min_terminated_length": 914.0, "epoch": 0.8348909657320872, "grad_norm": 0.6003454923629761, "kl": 0.05392787978053093, "learning_rate": 1.671875e-06, "loss": 0.0123, "num_tokens": 69148084.0, "reward": 1.436340093612671, "reward_std": 0.07149424403905869, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4363400638103485, "rewards/correct_reward_func/std": 0.14438651502132416, "step": 536 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2110.0, "completions/max_terminated_length": 2110.0, "completions/mean_length": 1409.46435546875, "completions/mean_terminated_length": 1409.46435546875, "completions/min_length": 778.0, "completions/min_terminated_length": 778.0, "epoch": 0.8364485981308412, "grad_norm": 0.6742749214172363, "kl": 0.053148942068219185, "learning_rate": 1.6712499999999999e-06, "loss": -0.0079, "num_tokens": 69272203.0, "reward": 1.4142444133758545, "reward_std": 0.10058359056711197, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.43805375695228577, "rewards/correct_reward_func/std": 0.17819122970104218, "step": 537 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2199.0, "completions/mean_length": 1534.0833740234375, "completions/mean_terminated_length": 1453.867431640625, "completions/min_length": 861.0, "completions/min_terminated_length": 861.0, "epoch": 0.838006230529595, "grad_norm": 0.572339653968811, "kl": 0.051896609365940094, "learning_rate": 1.670625e-06, "loss": 0.0649, "num_tokens": 69406928.0, "reward": 1.4281702041625977, "reward_std": 0.08568203449249268, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.44007474184036255, "rewards/correct_reward_func/std": 0.1391279697418213, "step": 538 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3857.0, "completions/max_terminated_length": 3857.0, "completions/mean_length": 1528.0, "completions/mean_terminated_length": 1528.0, "completions/min_length": 935.0, "completions/min_terminated_length": 935.0, "epoch": 0.839563862928349, "grad_norm": 0.558670699596405, "kl": 0.05065500736236572, "learning_rate": 1.6699999999999999e-06, "loss": 0.0049, "num_tokens": 69541424.0, "reward": 1.416330337524414, "reward_std": 0.12241604179143906, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.4401398301124573, "rewards/correct_reward_func/std": 0.12226840853691101, "step": 539 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2426.0, "completions/max_terminated_length": 2426.0, "completions/mean_length": 1439.107177734375, "completions/mean_terminated_length": 1439.107177734375, "completions/min_length": 756.0, "completions/min_terminated_length": 756.0, "epoch": 0.8411214953271028, "grad_norm": 0.6253755688667297, "kl": 0.05376381799578667, "learning_rate": 1.669375e-06, "loss": 0.0018, "num_tokens": 69668123.0, "reward": 1.4674383401870728, "reward_std": 0.0582718625664711, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4674382507801056, "rewards/correct_reward_func/std": 0.17001482844352722, "step": 540 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2122.0, "completions/max_terminated_length": 2122.0, "completions/mean_length": 1451.797607421875, "completions/mean_terminated_length": 1451.797607421875, "completions/min_length": 980.0, "completions/min_terminated_length": 980.0, "epoch": 0.8426791277258567, "grad_norm": 0.5441909432411194, "kl": 0.05154600366950035, "learning_rate": 1.66875e-06, "loss": -0.0146, "num_tokens": 69795798.0, "reward": 1.5086621046066284, "reward_std": 0.045781608670949936, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5086619853973389, "rewards/correct_reward_func/std": 0.15602315962314606, "step": 541 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2118.0, "completions/max_terminated_length": 2118.0, "completions/mean_length": 1410.3809814453125, "completions/mean_terminated_length": 1410.3809814453125, "completions/min_length": 894.0, "completions/min_terminated_length": 894.0, "epoch": 0.8442367601246106, "grad_norm": 0.5914663076400757, "kl": 0.053898200392723083, "learning_rate": 1.668125e-06, "loss": -0.02, "num_tokens": 69920132.0, "reward": 1.4829809665679932, "reward_std": 0.05561475455760956, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4829808175563812, "rewards/correct_reward_func/std": 0.14528672397136688, "step": 542 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2454.0, "completions/max_terminated_length": 2454.0, "completions/mean_length": 1559.607177734375, "completions/mean_terminated_length": 1559.607177734375, "completions/min_length": 901.0, "completions/min_terminated_length": 901.0, "epoch": 0.8457943925233645, "grad_norm": 0.5803366303443909, "kl": 0.05089765228331089, "learning_rate": 1.6675e-06, "loss": 0.024, "num_tokens": 70057187.0, "reward": 1.5257433652877808, "reward_std": 0.07609397917985916, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.525743305683136, "rewards/correct_reward_func/std": 0.15661990642547607, "step": 543 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2234.0, "completions/max_terminated_length": 2234.0, "completions/mean_length": 1478.202392578125, "completions/mean_terminated_length": 1478.202392578125, "completions/min_length": 906.0, "completions/min_terminated_length": 906.0, "epoch": 0.8473520249221184, "grad_norm": 0.631567120552063, "kl": 0.051157766953110695, "learning_rate": 1.666875e-06, "loss": -0.0001, "num_tokens": 70187470.0, "reward": 1.4295824766159058, "reward_std": 0.06356283277273178, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.42958250641822815, "rewards/correct_reward_func/std": 0.164540097117424, "step": 544 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2417.0, "completions/max_terminated_length": 2417.0, "completions/mean_length": 1515.0238037109375, "completions/mean_terminated_length": 1515.0238037109375, "completions/min_length": 935.0, "completions/min_terminated_length": 935.0, "epoch": 0.8489096573208723, "grad_norm": 0.5900546908378601, "kl": 0.05033543519675732, "learning_rate": 1.66625e-06, "loss": -0.0113, "num_tokens": 70320696.0, "reward": 1.4821819067001343, "reward_std": 0.0709303766489029, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4940865933895111, "rewards/correct_reward_func/std": 0.1473175436258316, "step": 545 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2307.0, "completions/max_terminated_length": 2307.0, "completions/mean_length": 1536.761962890625, "completions/mean_terminated_length": 1536.761962890625, "completions/min_length": 1048.0, "completions/min_terminated_length": 1048.0, "epoch": 0.8504672897196262, "grad_norm": 0.5679816007614136, "kl": 0.05010136775672436, "learning_rate": 1.6656249999999998e-06, "loss": -0.0173, "num_tokens": 70455964.0, "reward": 1.5630290508270264, "reward_std": 0.06558456271886826, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5630288124084473, "rewards/correct_reward_func/std": 0.16733138263225555, "step": 546 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2270.0, "completions/max_terminated_length": 2270.0, "completions/mean_length": 1501.6429443359375, "completions/mean_terminated_length": 1501.6429443359375, "completions/min_length": 979.0, "completions/min_terminated_length": 979.0, "epoch": 0.8520249221183801, "grad_norm": 0.5807083249092102, "kl": 0.05040537752211094, "learning_rate": 1.665e-06, "loss": 0.0092, "num_tokens": 70588168.0, "reward": 1.4890462160110474, "reward_std": 0.050529684871435165, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.489046186208725, "rewards/correct_reward_func/std": 0.11799110472202301, "step": 547 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2537.0, "completions/max_terminated_length": 2537.0, "completions/mean_length": 1504.90478515625, "completions/mean_terminated_length": 1504.90478515625, "completions/min_length": 844.0, "completions/min_terminated_length": 844.0, "epoch": 0.8535825545171339, "grad_norm": 0.5985187292098999, "kl": 0.0527173038572073, "learning_rate": 1.6643749999999998e-06, "loss": 0.0376, "num_tokens": 70720526.0, "reward": 1.4671262502670288, "reward_std": 0.09527470171451569, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4790307879447937, "rewards/correct_reward_func/std": 0.2005491405725479, "step": 548 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2135.0, "completions/max_terminated_length": 2135.0, "completions/mean_length": 1473.0, "completions/mean_terminated_length": 1473.0, "completions/min_length": 635.0, "completions/min_terminated_length": 635.0, "epoch": 0.8551401869158879, "grad_norm": 0.6216426491737366, "kl": 0.0518038310110569, "learning_rate": 1.66375e-06, "loss": 0.002, "num_tokens": 70850270.0, "reward": 1.4148383140563965, "reward_std": 0.13262715935707092, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.4386478662490845, "rewards/correct_reward_func/std": 0.1450170874595642, "step": 549 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2297.0, "completions/mean_length": 1617.7738037109375, "completions/mean_terminated_length": 1538.566162109375, "completions/min_length": 1021.0, "completions/min_terminated_length": 1021.0, "epoch": 0.8566978193146417, "grad_norm": 0.5692617893218994, "kl": 0.04915030300617218, "learning_rate": 1.6631249999999999e-06, "loss": 0.0763, "num_tokens": 70992001.0, "reward": 1.4859832525253296, "reward_std": 0.08638235181570053, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.49788784980773926, "rewards/correct_reward_func/std": 0.1480027735233307, "step": 550 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2063.0, "completions/mean_length": 1563.7261962890625, "completions/mean_terminated_length": 1483.867431640625, "completions/min_length": 1052.0, "completions/min_terminated_length": 1052.0, "epoch": 0.8582554517133957, "grad_norm": 0.5355828404426575, "kl": 0.04826325178146362, "learning_rate": 1.6625e-06, "loss": 0.0394, "num_tokens": 71129288.0, "reward": 1.4876452684402466, "reward_std": 0.05155961960554123, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4876452386379242, "rewards/correct_reward_func/std": 0.17587290704250336, "step": 551 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2185.0, "completions/max_terminated_length": 2185.0, "completions/mean_length": 1465.4761962890625, "completions/mean_terminated_length": 1465.4761962890625, "completions/min_length": 881.0, "completions/min_terminated_length": 881.0, "epoch": 0.8598130841121495, "grad_norm": 0.6027195453643799, "kl": 0.051084551960229874, "learning_rate": 1.6618749999999999e-06, "loss": -0.0266, "num_tokens": 71258208.0, "reward": 1.48594069480896, "reward_std": 0.06709278374910355, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4859406650066376, "rewards/correct_reward_func/std": 0.16938088834285736, "step": 552 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4262.0, "completions/max_terminated_length": 4262.0, "completions/mean_length": 1517.666748046875, "completions/mean_terminated_length": 1517.666748046875, "completions/min_length": 877.0, "completions/min_terminated_length": 877.0, "epoch": 0.8613707165109035, "grad_norm": 0.580878734588623, "kl": 0.04972629249095917, "learning_rate": 1.6612499999999998e-06, "loss": -0.0144, "num_tokens": 71391566.0, "reward": 1.4727210998535156, "reward_std": 0.06501049548387527, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4727211892604828, "rewards/correct_reward_func/std": 0.17857278883457184, "step": 553 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1985.0, "completions/max_terminated_length": 1985.0, "completions/mean_length": 1515.7857666015625, "completions/mean_terminated_length": 1515.7857666015625, "completions/min_length": 840.0, "completions/min_terminated_length": 840.0, "epoch": 0.8629283489096573, "grad_norm": 0.599119246006012, "kl": 0.050645509734749794, "learning_rate": 1.6606249999999999e-06, "loss": -0.0003, "num_tokens": 71524916.0, "reward": 1.4626071453094482, "reward_std": 0.053476471453905106, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.46260714530944824, "rewards/correct_reward_func/std": 0.16813401877880096, "step": 554 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5913.0, "completions/max_terminated_length": 5913.0, "completions/mean_length": 1586.047607421875, "completions/mean_terminated_length": 1586.047607421875, "completions/min_length": 1029.0, "completions/min_terminated_length": 1029.0, "epoch": 0.8644859813084113, "grad_norm": 0.547502338886261, "kl": 0.04839299060404301, "learning_rate": 1.6599999999999998e-06, "loss": 0.0048, "num_tokens": 71664126.0, "reward": 1.568305253982544, "reward_std": 0.08509069681167603, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5802100300788879, "rewards/correct_reward_func/std": 0.1785019338130951, "step": 555 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.023809523809523836, "completions/max_length": 8192.0, "completions/max_terminated_length": 2329.0, "completions/mean_length": 1738.5238037109375, "completions/mean_terminated_length": 1581.1219482421875, "completions/min_length": 759.0, "completions/min_terminated_length": 759.0, "epoch": 0.8660436137071651, "grad_norm": 0.5003688931465149, "kl": 0.047175006940960884, "learning_rate": 1.6593749999999999e-06, "loss": 0.072, "num_tokens": 71816114.0, "reward": 1.487537145614624, "reward_std": 0.078678660094738, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4875370264053345, "rewards/correct_reward_func/std": 0.17857803404331207, "step": 556 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2373.0, "completions/max_terminated_length": 2373.0, "completions/mean_length": 1579.0595703125, "completions/mean_terminated_length": 1579.0595703125, "completions/min_length": 778.0, "completions/min_terminated_length": 778.0, "epoch": 0.867601246105919, "grad_norm": 0.583847165107727, "kl": 0.04886885918676853, "learning_rate": 1.6587499999999998e-06, "loss": 0.0038, "num_tokens": 71954809.0, "reward": 1.4350553750991821, "reward_std": 0.06607770174741745, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.43505528569221497, "rewards/correct_reward_func/std": 0.10001393407583237, "step": 557 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2380.0, "completions/max_terminated_length": 2380.0, "completions/mean_length": 1577.0, "completions/mean_terminated_length": 1577.0, "completions/min_length": 924.0, "completions/min_terminated_length": 924.0, "epoch": 0.8691588785046729, "grad_norm": 0.5692570805549622, "kl": 0.05144515633583069, "learning_rate": 1.658125e-06, "loss": -0.0243, "num_tokens": 72093283.0, "reward": 1.4471040964126587, "reward_std": 0.06054630130529404, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.44710394740104675, "rewards/correct_reward_func/std": 0.1185460090637207, "step": 558 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2167.0, "completions/mean_length": 1629.9285888671875, "completions/mean_terminated_length": 1550.867431640625, "completions/min_length": 830.0, "completions/min_terminated_length": 830.0, "epoch": 0.8707165109034268, "grad_norm": 0.5377056002616882, "kl": 0.049677252769470215, "learning_rate": 1.6574999999999998e-06, "loss": 0.0604, "num_tokens": 72236281.0, "reward": 1.4650782346725464, "reward_std": 0.08549048751592636, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4650781750679016, "rewards/correct_reward_func/std": 0.13779646158218384, "step": 559 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2994.0, "completions/max_terminated_length": 2994.0, "completions/mean_length": 1595.09521484375, "completions/mean_terminated_length": 1595.09521484375, "completions/min_length": 921.0, "completions/min_terminated_length": 921.0, "epoch": 0.8722741433021807, "grad_norm": 0.5444644689559937, "kl": 0.053564492613077164, "learning_rate": 1.6568750000000001e-06, "loss": -0.0056, "num_tokens": 72376257.0, "reward": 1.464841604232788, "reward_std": 0.07330876588821411, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4648415148258209, "rewards/correct_reward_func/std": 0.13325951993465424, "step": 560 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2346.0, "completions/max_terminated_length": 2346.0, "completions/mean_length": 1595.761962890625, "completions/mean_terminated_length": 1595.761962890625, "completions/min_length": 1008.0, "completions/min_terminated_length": 1008.0, "epoch": 0.8738317757009346, "grad_norm": 0.5756959915161133, "kl": 0.0488431490957737, "learning_rate": 1.65625e-06, "loss": -0.0016, "num_tokens": 72516427.0, "reward": 1.5044291019439697, "reward_std": 0.05856965854763985, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5044289827346802, "rewards/correct_reward_func/std": 0.16118833422660828, "step": 561 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2420.0, "completions/max_terminated_length": 2420.0, "completions/mean_length": 1523.202392578125, "completions/mean_terminated_length": 1523.202392578125, "completions/min_length": 905.0, "completions/min_terminated_length": 905.0, "epoch": 0.8753894080996885, "grad_norm": 0.6184421181678772, "kl": 0.051321882754564285, "learning_rate": 1.655625e-06, "loss": 0.0021, "num_tokens": 72650334.0, "reward": 1.439370036125183, "reward_std": 0.12014901638031006, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4512746334075928, "rewards/correct_reward_func/std": 0.13923780620098114, "step": 562 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2735.0, "completions/max_terminated_length": 2735.0, "completions/mean_length": 1580.0595703125, "completions/mean_terminated_length": 1580.0595703125, "completions/min_length": 1087.0, "completions/min_terminated_length": 1087.0, "epoch": 0.8769470404984424, "grad_norm": 0.5475049018859863, "kl": 0.05060616135597229, "learning_rate": 1.655e-06, "loss": 0.0094, "num_tokens": 72789155.0, "reward": 1.4986872673034668, "reward_std": 0.041443560272455215, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4986870288848877, "rewards/correct_reward_func/std": 0.1334153264760971, "step": 563 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2607.0, "completions/mean_length": 1658.9881591796875, "completions/mean_terminated_length": 1580.277099609375, "completions/min_length": 899.0, "completions/min_terminated_length": 899.0, "epoch": 0.8785046728971962, "grad_norm": 0.5355792045593262, "kl": 0.0508806686848402, "learning_rate": 1.654375e-06, "loss": -0.0215, "num_tokens": 72934552.0, "reward": 1.425096035003662, "reward_std": 0.0804082602262497, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4370007812976837, "rewards/correct_reward_func/std": 0.12526416778564453, "step": 564 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2914.0, "completions/max_terminated_length": 2914.0, "completions/mean_length": 1626.2857666015625, "completions/mean_terminated_length": 1626.2857666015625, "completions/min_length": 1013.0, "completions/min_terminated_length": 1013.0, "epoch": 0.8800623052959502, "grad_norm": 0.5639548897743225, "kl": 0.05000521242618561, "learning_rate": 1.65375e-06, "loss": 0.0054, "num_tokens": 73077196.0, "reward": 1.4919753074645996, "reward_std": 0.06127806007862091, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.49197524785995483, "rewards/correct_reward_func/std": 0.1505451202392578, "step": 565 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2300.0, "completions/max_terminated_length": 2300.0, "completions/mean_length": 1541.8690185546875, "completions/mean_terminated_length": 1541.8690185546875, "completions/min_length": 839.0, "completions/min_terminated_length": 839.0, "epoch": 0.881619937694704, "grad_norm": 0.5418336987495422, "kl": 0.04926094599068165, "learning_rate": 1.653125e-06, "loss": -0.0078, "num_tokens": 73212797.0, "reward": 1.5520154237747192, "reward_std": 0.06705118715763092, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5520154237747192, "rewards/correct_reward_func/std": 0.22710636258125305, "step": 566 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2350.0, "completions/max_terminated_length": 2350.0, "completions/mean_length": 1580.0357666015625, "completions/mean_terminated_length": 1580.0357666015625, "completions/min_length": 948.0, "completions/min_terminated_length": 948.0, "epoch": 0.883177570093458, "grad_norm": 0.5915489196777344, "kl": 0.05472877249121666, "learning_rate": 1.6525e-06, "loss": -0.0068, "num_tokens": 73351538.0, "reward": 1.450407862663269, "reward_std": 0.10079541802406311, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.47421735525131226, "rewards/correct_reward_func/std": 0.12341609597206116, "step": 567 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2387.0, "completions/max_terminated_length": 2387.0, "completions/mean_length": 1513.6905517578125, "completions/mean_terminated_length": 1513.6905517578125, "completions/min_length": 1000.0, "completions/min_terminated_length": 1000.0, "epoch": 0.8847352024922118, "grad_norm": 0.600501537322998, "kl": 0.05078642629086971, "learning_rate": 1.651875e-06, "loss": -0.0143, "num_tokens": 73484532.0, "reward": 1.485834002494812, "reward_std": 0.07049893587827682, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4858340322971344, "rewards/correct_reward_func/std": 0.16757294535636902, "step": 568 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3088.0, "completions/max_terminated_length": 3088.0, "completions/mean_length": 1561.09521484375, "completions/mean_terminated_length": 1561.09521484375, "completions/min_length": 974.0, "completions/min_terminated_length": 974.0, "epoch": 0.8862928348909658, "grad_norm": 0.641735851764679, "kl": 0.052700335159897804, "learning_rate": 1.65125e-06, "loss": -0.0118, "num_tokens": 73621460.0, "reward": 1.4019627571105957, "reward_std": 0.089509978890419, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4138675034046173, "rewards/correct_reward_func/std": 0.20820264518260956, "step": 569 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2686.0, "completions/max_terminated_length": 2686.0, "completions/mean_length": 1680.702392578125, "completions/mean_terminated_length": 1680.702392578125, "completions/min_length": 1087.0, "completions/min_terminated_length": 1087.0, "epoch": 0.8878504672897196, "grad_norm": 0.5752266049385071, "kl": 0.05081222578883171, "learning_rate": 1.650625e-06, "loss": 0.001, "num_tokens": 73768867.0, "reward": 1.428143858909607, "reward_std": 0.08753962814807892, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.440048485994339, "rewards/correct_reward_func/std": 0.14757801592350006, "step": 570 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2271.0, "completions/max_terminated_length": 2271.0, "completions/mean_length": 1525.011962890625, "completions/mean_terminated_length": 1525.011962890625, "completions/min_length": 799.0, "completions/min_terminated_length": 799.0, "epoch": 0.8894080996884736, "grad_norm": 0.5790801644325256, "kl": 0.050931330770254135, "learning_rate": 1.6499999999999999e-06, "loss": -0.0014, "num_tokens": 73902932.0, "reward": 1.5280438661575317, "reward_std": 0.07709922641515732, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5399484634399414, "rewards/correct_reward_func/std": 0.19332382082939148, "step": 571 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2723.0, "completions/max_terminated_length": 2723.0, "completions/mean_length": 1548.2857666015625, "completions/mean_terminated_length": 1548.2857666015625, "completions/min_length": 814.0, "completions/min_terminated_length": 814.0, "epoch": 0.8909657320872274, "grad_norm": 0.5646366477012634, "kl": 0.05197379179298878, "learning_rate": 1.649375e-06, "loss": 0.0077, "num_tokens": 74038952.0, "reward": 1.399301290512085, "reward_std": 0.0882752537727356, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4112059772014618, "rewards/correct_reward_func/std": 0.13882336020469666, "step": 572 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2738.0, "completions/max_terminated_length": 2738.0, "completions/mean_length": 1620.15478515625, "completions/mean_terminated_length": 1620.15478515625, "completions/min_length": 1027.0, "completions/min_terminated_length": 1027.0, "epoch": 0.8925233644859814, "grad_norm": 0.5650038719177246, "kl": 0.0501062236726284, "learning_rate": 1.6487499999999999e-06, "loss": 0.0136, "num_tokens": 74180907.0, "reward": 1.4877279996871948, "reward_std": 0.06202785298228264, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.48772794008255005, "rewards/correct_reward_func/std": 0.1389545202255249, "step": 573 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2306.0, "completions/max_terminated_length": 2306.0, "completions/mean_length": 1473.297607421875, "completions/mean_terminated_length": 1473.297607421875, "completions/min_length": 799.0, "completions/min_terminated_length": 799.0, "epoch": 0.8940809968847352, "grad_norm": 0.5704150199890137, "kl": 0.05233505181968212, "learning_rate": 1.648125e-06, "loss": -0.0144, "num_tokens": 74310478.0, "reward": 1.524600625038147, "reward_std": 0.10687962174415588, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.536505401134491, "rewards/correct_reward_func/std": 0.17535418272018433, "step": 574 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2972.0, "completions/max_terminated_length": 2972.0, "completions/mean_length": 1529.8095703125, "completions/mean_terminated_length": 1529.8095703125, "completions/min_length": 751.0, "completions/min_terminated_length": 751.0, "epoch": 0.8956386292834891, "grad_norm": 0.583591878414154, "kl": 0.05231664888560772, "learning_rate": 1.6475e-06, "loss": -0.0175, "num_tokens": 74444784.0, "reward": 1.4119700193405151, "reward_std": 0.08519253879785538, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.42387470602989197, "rewards/correct_reward_func/std": 0.1410830169916153, "step": 575 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 3040.0, "completions/mean_length": 1685.96435546875, "completions/mean_terminated_length": 1607.5782470703125, "completions/min_length": 1009.0, "completions/min_terminated_length": 1009.0, "epoch": 0.897196261682243, "grad_norm": 0.5724698901176453, "kl": 0.05051821656525135, "learning_rate": 1.646875e-06, "loss": 0.0665, "num_tokens": 74592453.0, "reward": 1.543835997581482, "reward_std": 0.059640318155288696, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5438359379768372, "rewards/correct_reward_func/std": 0.19233205914497375, "step": 576 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2392.0, "completions/max_terminated_length": 2392.0, "completions/mean_length": 1596.0, "completions/mean_terminated_length": 1596.0, "completions/min_length": 1028.0, "completions/min_terminated_length": 1028.0, "epoch": 0.8987538940809969, "grad_norm": 0.5430770516395569, "kl": 0.05068780109286308, "learning_rate": 1.64625e-06, "loss": 0.003, "num_tokens": 74732313.0, "reward": 1.4892923831939697, "reward_std": 0.045956652611494064, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4892924129962921, "rewards/correct_reward_func/std": 0.13337074220180511, "step": 577 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2648.0, "completions/max_terminated_length": 2648.0, "completions/mean_length": 1619.3333740234375, "completions/mean_terminated_length": 1619.3333740234375, "completions/min_length": 1108.0, "completions/min_terminated_length": 1108.0, "epoch": 0.9003115264797508, "grad_norm": 0.5669228434562683, "kl": 0.05153697915375233, "learning_rate": 1.6456249999999998e-06, "loss": -0.0006, "num_tokens": 74874247.0, "reward": 1.4417579174041748, "reward_std": 0.08493451774120331, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.453662633895874, "rewards/correct_reward_func/std": 0.14506934583187103, "step": 578 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2490.0, "completions/max_terminated_length": 2490.0, "completions/mean_length": 1587.84521484375, "completions/mean_terminated_length": 1587.84521484375, "completions/min_length": 998.0, "completions/min_terminated_length": 998.0, "epoch": 0.9018691588785047, "grad_norm": 0.5580478310585022, "kl": 0.0523674376308918, "learning_rate": 1.645e-06, "loss": 0.018, "num_tokens": 75013500.0, "reward": 1.4614784717559814, "reward_std": 0.06435896456241608, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.46147841215133667, "rewards/correct_reward_func/std": 0.11859949678182602, "step": 579 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3307.0, "completions/max_terminated_length": 3307.0, "completions/mean_length": 1637.8214111328125, "completions/mean_terminated_length": 1637.8214111328125, "completions/min_length": 1092.0, "completions/min_terminated_length": 1092.0, "epoch": 0.9034267912772586, "grad_norm": 0.540749192237854, "kl": 0.05217336490750313, "learning_rate": 1.6443749999999998e-06, "loss": 0.005, "num_tokens": 75157203.0, "reward": 1.4611413478851318, "reward_std": 0.10950693488121033, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.4849506914615631, "rewards/correct_reward_func/std": 0.16065186262130737, "step": 580 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2337.0, "completions/max_terminated_length": 2337.0, "completions/mean_length": 1549.2261962890625, "completions/mean_terminated_length": 1549.2261962890625, "completions/min_length": 938.0, "completions/min_terminated_length": 938.0, "epoch": 0.9049844236760125, "grad_norm": 0.6054486632347107, "kl": 0.05162344500422478, "learning_rate": 1.64375e-06, "loss": 0.0224, "num_tokens": 75293266.0, "reward": 1.4400663375854492, "reward_std": 0.048954516649246216, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4400663375854492, "rewards/correct_reward_func/std": 0.1594221442937851, "step": 581 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2236.0, "completions/max_terminated_length": 2236.0, "completions/mean_length": 1517.6785888671875, "completions/mean_terminated_length": 1517.6785888671875, "completions/min_length": 896.0, "completions/min_terminated_length": 896.0, "epoch": 0.9065420560747663, "grad_norm": 0.5864580273628235, "kl": 0.052071839570999146, "learning_rate": 1.6431249999999998e-06, "loss": 0.0076, "num_tokens": 75426745.0, "reward": 1.4933863878250122, "reward_std": 0.054050467908382416, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4933864176273346, "rewards/correct_reward_func/std": 0.17375320196151733, "step": 582 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2700.0, "completions/max_terminated_length": 2700.0, "completions/mean_length": 1574.7857666015625, "completions/mean_terminated_length": 1574.7857666015625, "completions/min_length": 740.0, "completions/min_terminated_length": 740.0, "epoch": 0.9080996884735203, "grad_norm": 0.5605589747428894, "kl": 0.05352449230849743, "learning_rate": 1.6425e-06, "loss": -0.0052, "num_tokens": 75565075.0, "reward": 1.488409399986267, "reward_std": 0.11526400595903397, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.5122188329696655, "rewards/correct_reward_func/std": 0.14258237183094025, "step": 583 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2662.0, "completions/max_terminated_length": 2662.0, "completions/mean_length": 1574.011962890625, "completions/mean_terminated_length": 1574.011962890625, "completions/min_length": 889.0, "completions/min_terminated_length": 889.0, "epoch": 0.9096573208722741, "grad_norm": 0.5379892587661743, "kl": 0.05358175188302994, "learning_rate": 1.6418749999999998e-06, "loss": -0.0107, "num_tokens": 75703280.0, "reward": 1.4819163084030151, "reward_std": 0.11179199814796448, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.5057256817817688, "rewards/correct_reward_func/std": 0.16328661143779755, "step": 584 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2656.0, "completions/max_terminated_length": 2656.0, "completions/mean_length": 1526.857177734375, "completions/mean_terminated_length": 1526.857177734375, "completions/min_length": 1027.0, "completions/min_terminated_length": 1027.0, "epoch": 0.9112149532710281, "grad_norm": 0.5592092871665955, "kl": 0.052205150946974754, "learning_rate": 1.64125e-06, "loss": -0.0203, "num_tokens": 75837530.0, "reward": 1.4071049690246582, "reward_std": 0.15914778411388397, "rewards/contains_chinese/mean": 0.9523809552192688, "rewards/contains_chinese/std": 0.21423791348934174, "rewards/correct_reward_func/mean": 0.454723984003067, "rewards/correct_reward_func/std": 0.16848695278167725, "step": 585 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2317.0, "completions/max_terminated_length": 2317.0, "completions/mean_length": 1569.4881591796875, "completions/mean_terminated_length": 1569.4881591796875, "completions/min_length": 899.0, "completions/min_terminated_length": 899.0, "epoch": 0.9127725856697819, "grad_norm": 0.5686935782432556, "kl": 0.05123456381261349, "learning_rate": 1.6406249999999999e-06, "loss": 0.0122, "num_tokens": 75975445.0, "reward": 1.4490768909454346, "reward_std": 0.06693350523710251, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4490768313407898, "rewards/correct_reward_func/std": 0.16664773225784302, "step": 586 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2338.0, "completions/max_terminated_length": 2338.0, "completions/mean_length": 1549.047607421875, "completions/mean_terminated_length": 1549.047607421875, "completions/min_length": 873.0, "completions/min_terminated_length": 873.0, "epoch": 0.9143302180685359, "grad_norm": 0.566740095615387, "kl": 0.05220544897019863, "learning_rate": 1.6399999999999998e-06, "loss": -0.0109, "num_tokens": 76111565.0, "reward": 1.464043378829956, "reward_std": 0.12711408734321594, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.4878527820110321, "rewards/correct_reward_func/std": 0.16010543704032898, "step": 587 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2122.0, "completions/mean_length": 1619.8214111328125, "completions/mean_terminated_length": 1540.6385498046875, "completions/min_length": 834.0, "completions/min_terminated_length": 834.0, "epoch": 0.9158878504672897, "grad_norm": 0.5762239694595337, "kl": 0.052646003663539886, "learning_rate": 1.6393749999999999e-06, "loss": 0.0575, "num_tokens": 76253672.0, "reward": 1.441072702407837, "reward_std": 0.0739678293466568, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.44107261300086975, "rewards/correct_reward_func/std": 0.12623170018196106, "step": 588 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2457.0, "completions/max_terminated_length": 2457.0, "completions/mean_length": 1510.1309814453125, "completions/mean_terminated_length": 1510.1309814453125, "completions/min_length": 1069.0, "completions/min_terminated_length": 1069.0, "epoch": 0.9174454828660437, "grad_norm": 0.5868603587150574, "kl": 0.05117998085916042, "learning_rate": 1.6387499999999998e-06, "loss": -0.0027, "num_tokens": 76386439.0, "reward": 1.5747073888778687, "reward_std": 0.06914177536964417, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5747074484825134, "rewards/correct_reward_func/std": 0.14264759421348572, "step": 589 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2371.0, "completions/max_terminated_length": 2371.0, "completions/mean_length": 1601.357177734375, "completions/mean_terminated_length": 1601.357177734375, "completions/min_length": 1061.0, "completions/min_terminated_length": 1061.0, "epoch": 0.9190031152647975, "grad_norm": 0.5775259137153625, "kl": 0.05480557680130005, "learning_rate": 1.6381249999999999e-06, "loss": 0.008, "num_tokens": 76527043.0, "reward": 1.4700545072555542, "reward_std": 0.08241315186023712, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4819592535495758, "rewards/correct_reward_func/std": 0.14215821027755737, "step": 590 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2647.0, "completions/max_terminated_length": 2647.0, "completions/mean_length": 1482.797607421875, "completions/mean_terminated_length": 1482.797607421875, "completions/min_length": 934.0, "completions/min_terminated_length": 934.0, "epoch": 0.9205607476635514, "grad_norm": 0.599651575088501, "kl": 0.05351861007511616, "learning_rate": 1.6374999999999998e-06, "loss": -0.0187, "num_tokens": 76657436.0, "reward": 1.5297892093658447, "reward_std": 0.06223803758621216, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5297890901565552, "rewards/correct_reward_func/std": 0.18300145864486694, "step": 591 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2845.0, "completions/max_terminated_length": 2845.0, "completions/mean_length": 1613.2738037109375, "completions/mean_terminated_length": 1613.2738037109375, "completions/min_length": 1017.0, "completions/min_terminated_length": 1017.0, "epoch": 0.9221183800623053, "grad_norm": 0.5727816224098206, "kl": 0.0537562221288681, "learning_rate": 1.636875e-06, "loss": 0.012, "num_tokens": 76798939.0, "reward": 1.5009434223175049, "reward_std": 0.08991846442222595, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.5247528553009033, "rewards/correct_reward_func/std": 0.18158107995986938, "step": 592 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2359.0, "completions/mean_length": 1659.5714111328125, "completions/mean_terminated_length": 1580.867431640625, "completions/min_length": 1085.0, "completions/min_terminated_length": 1085.0, "epoch": 0.9236760124610592, "grad_norm": 0.5455081462860107, "kl": 0.05132809653878212, "learning_rate": 1.63625e-06, "loss": 0.0699, "num_tokens": 76944613.0, "reward": 1.4970442056655884, "reward_std": 0.09464308619499207, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5089487433433533, "rewards/correct_reward_func/std": 0.16302646696567535, "step": 593 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2390.0, "completions/max_terminated_length": 2390.0, "completions/mean_length": 1536.5238037109375, "completions/mean_terminated_length": 1536.5238037109375, "completions/min_length": 719.0, "completions/min_terminated_length": 719.0, "epoch": 0.9252336448598131, "grad_norm": 0.5868439078330994, "kl": 0.05363127030432224, "learning_rate": 1.6356250000000001e-06, "loss": -0.0331, "num_tokens": 77079573.0, "reward": 1.4771671295166016, "reward_std": 0.05991039052605629, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4771670699119568, "rewards/correct_reward_func/std": 0.1585981547832489, "step": 594 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2357.0, "completions/max_terminated_length": 2357.0, "completions/mean_length": 1595.8095703125, "completions/mean_terminated_length": 1595.8095703125, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.926791277258567, "grad_norm": 0.5364544987678528, "kl": 0.055180374532938004, "learning_rate": 1.635e-06, "loss": 0.0001, "num_tokens": 77219441.0, "reward": 1.4917380809783936, "reward_std": 0.09097646921873093, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4917379915714264, "rewards/correct_reward_func/std": 0.16991373896598816, "step": 595 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2668.0, "completions/max_terminated_length": 2668.0, "completions/mean_length": 1606.0833740234375, "completions/mean_terminated_length": 1606.0833740234375, "completions/min_length": 1030.0, "completions/min_terminated_length": 1030.0, "epoch": 0.9283489096573209, "grad_norm": 0.5445214509963989, "kl": 0.05271473526954651, "learning_rate": 1.634375e-06, "loss": -0.023, "num_tokens": 77360292.0, "reward": 1.4440690279006958, "reward_std": 0.048418521881103516, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.44406890869140625, "rewards/correct_reward_func/std": 0.16575324535369873, "step": 596 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2422.0, "completions/max_terminated_length": 2422.0, "completions/mean_length": 1495.71435546875, "completions/mean_terminated_length": 1495.71435546875, "completions/min_length": 1008.0, "completions/min_terminated_length": 1008.0, "epoch": 0.9299065420560748, "grad_norm": 0.5676112771034241, "kl": 0.05709120258688927, "learning_rate": 1.63375e-06, "loss": -0.0011, "num_tokens": 77491650.0, "reward": 1.5036193132400513, "reward_std": 0.07075376063585281, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5036192536354065, "rewards/correct_reward_func/std": 0.15129926800727844, "step": 597 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2412.0, "completions/max_terminated_length": 2412.0, "completions/mean_length": 1607.1785888671875, "completions/mean_terminated_length": 1607.1785888671875, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.9314641744548287, "grad_norm": 0.5542066097259521, "kl": 0.05464941821992397, "learning_rate": 1.633125e-06, "loss": 0.0103, "num_tokens": 77632623.0, "reward": 1.4994677305221558, "reward_std": 0.09316570311784744, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.511372447013855, "rewards/correct_reward_func/std": 0.10848263651132584, "step": 598 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2396.0, "completions/max_terminated_length": 2396.0, "completions/mean_length": 1543.1309814453125, "completions/mean_terminated_length": 1543.1309814453125, "completions/min_length": 375.0, "completions/min_terminated_length": 375.0, "epoch": 0.9330218068535826, "grad_norm": 0.5920741558074951, "kl": 0.05645397678017616, "learning_rate": 1.6325e-06, "loss": 0.0179, "num_tokens": 77768348.0, "reward": 1.4137533903121948, "reward_std": 0.130188450217247, "rewards/contains_chinese/mean": 0.9642857313156128, "rewards/contains_chinese/std": 0.18669144809246063, "rewards/correct_reward_func/mean": 0.44946759939193726, "rewards/correct_reward_func/std": 0.1394752562046051, "step": 599 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2292.0, "completions/max_terminated_length": 2292.0, "completions/mean_length": 1543.8214111328125, "completions/mean_terminated_length": 1543.8214111328125, "completions/min_length": 1063.0, "completions/min_terminated_length": 1063.0, "epoch": 0.9345794392523364, "grad_norm": 0.629948079586029, "kl": 0.05816573277115822, "learning_rate": 1.631875e-06, "loss": 0.008, "num_tokens": 77903795.0, "reward": 1.4406551122665405, "reward_std": 0.08957747370004654, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.44065502285957336, "rewards/correct_reward_func/std": 0.16707439720630646, "step": 600 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2268.0, "completions/mean_length": 1753.5714111328125, "completions/mean_terminated_length": 1675.9998779296875, "completions/min_length": 1085.0, "completions/min_terminated_length": 1085.0, "epoch": 0.9361370716510904, "grad_norm": 0.596502959728241, "kl": 0.05688577890396118, "learning_rate": 1.63125e-06, "loss": 0.0596, "num_tokens": 78057203.0, "reward": 1.4966117143630981, "reward_std": 0.08040610700845718, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4966115951538086, "rewards/correct_reward_func/std": 0.22688382863998413, "step": 601 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3153.0, "completions/max_terminated_length": 3153.0, "completions/mean_length": 1590.25, "completions/mean_terminated_length": 1590.25, "completions/min_length": 910.0, "completions/min_terminated_length": 910.0, "epoch": 0.9376947040498442, "grad_norm": 0.5286832451820374, "kl": 0.05895489826798439, "learning_rate": 1.630625e-06, "loss": -0.0303, "num_tokens": 78196856.0, "reward": 1.4874906539916992, "reward_std": 0.044962868094444275, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.48749059438705444, "rewards/correct_reward_func/std": 0.1546032726764679, "step": 602 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2431.0, "completions/max_terminated_length": 2431.0, "completions/mean_length": 1600.21435546875, "completions/mean_terminated_length": 1600.21435546875, "completions/min_length": 681.0, "completions/min_terminated_length": 681.0, "epoch": 0.9392523364485982, "grad_norm": 0.5734366178512573, "kl": 0.05501212365925312, "learning_rate": 1.6299999999999999e-06, "loss": 0.0089, "num_tokens": 78337274.0, "reward": 1.5072236061096191, "reward_std": 0.05490497127175331, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5072234869003296, "rewards/correct_reward_func/std": 0.11040540784597397, "step": 603 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2413.0, "completions/max_terminated_length": 2413.0, "completions/mean_length": 1633.90478515625, "completions/mean_terminated_length": 1633.90478515625, "completions/min_length": 1092.0, "completions/min_terminated_length": 1092.0, "epoch": 0.940809968847352, "grad_norm": 0.5251239538192749, "kl": 0.059018656611442566, "learning_rate": 1.629375e-06, "loss": -0.0014, "num_tokens": 78480546.0, "reward": 1.4861642122268677, "reward_std": 0.06594221293926239, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4980688989162445, "rewards/correct_reward_func/std": 0.16121798753738403, "step": 604 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2623.0, "completions/max_terminated_length": 2623.0, "completions/mean_length": 1652.1309814453125, "completions/mean_terminated_length": 1652.1309814453125, "completions/min_length": 776.0, "completions/min_terminated_length": 776.0, "epoch": 0.942367601246106, "grad_norm": 0.5935388803482056, "kl": 0.051552364602684975, "learning_rate": 1.6287499999999999e-06, "loss": 0.0244, "num_tokens": 78625445.0, "reward": 1.4352984428405762, "reward_std": 0.13022708892822266, "rewards/contains_chinese/mean": 0.9642857313156128, "rewards/contains_chinese/std": 0.18669144809246063, "rewards/correct_reward_func/mean": 0.47101256251335144, "rewards/correct_reward_func/std": 0.1250392645597458, "step": 605 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 3299.0, "completions/mean_length": 1707.4405517578125, "completions/mean_terminated_length": 1629.313232421875, "completions/min_length": 1127.0, "completions/min_terminated_length": 1127.0, "epoch": 0.9439252336448598, "grad_norm": 0.6000062227249146, "kl": 0.054089561104774475, "learning_rate": 1.628125e-06, "loss": 0.0647, "num_tokens": 78774852.0, "reward": 1.4837253093719482, "reward_std": 0.07214730232954025, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4837252199649811, "rewards/correct_reward_func/std": 0.1362723410129547, "step": 606 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2870.0, "completions/mean_length": 1669.357177734375, "completions/mean_terminated_length": 1590.77099609375, "completions/min_length": 1059.0, "completions/min_terminated_length": 1059.0, "epoch": 0.9454828660436138, "grad_norm": 0.5420730113983154, "kl": 0.051561569795012474, "learning_rate": 1.6274999999999999e-06, "loss": 0.0154, "num_tokens": 78921096.0, "reward": 1.472241759300232, "reward_std": 0.10751719772815704, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4841463565826416, "rewards/correct_reward_func/std": 0.1544354408979416, "step": 607 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2360.0, "completions/max_terminated_length": 2360.0, "completions/mean_length": 1506.25, "completions/mean_terminated_length": 1506.25, "completions/min_length": 878.0, "completions/min_terminated_length": 878.0, "epoch": 0.9470404984423676, "grad_norm": 0.6013126373291016, "kl": 0.05470665171742439, "learning_rate": 1.626875e-06, "loss": 0.0038, "num_tokens": 79053429.0, "reward": 1.5111743211746216, "reward_std": 0.06690473109483719, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5111743211746216, "rewards/correct_reward_func/std": 0.18495941162109375, "step": 608 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2235.0, "completions/mean_length": 1583.166748046875, "completions/mean_terminated_length": 1503.5421142578125, "completions/min_length": 961.0, "completions/min_terminated_length": 961.0, "epoch": 0.9485981308411215, "grad_norm": 0.5386534929275513, "kl": 0.051652608439326286, "learning_rate": 1.6262499999999999e-06, "loss": 0.056, "num_tokens": 79192355.0, "reward": 1.4510307312011719, "reward_std": 0.0934084877371788, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4629353880882263, "rewards/correct_reward_func/std": 0.14852337539196014, "step": 609 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2729.0, "completions/max_terminated_length": 2729.0, "completions/mean_length": 1621.202392578125, "completions/mean_terminated_length": 1621.202392578125, "completions/min_length": 971.0, "completions/min_terminated_length": 971.0, "epoch": 0.9501557632398754, "grad_norm": 0.6040880680084229, "kl": 0.053494108840823174, "learning_rate": 1.625625e-06, "loss": -0.0353, "num_tokens": 79334620.0, "reward": 1.4715975522994995, "reward_std": 0.0786285549402237, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4715975821018219, "rewards/correct_reward_func/std": 0.11298926919698715, "step": 610 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2967.0, "completions/max_terminated_length": 2967.0, "completions/mean_length": 1519.4285888671875, "completions/mean_terminated_length": 1519.4285888671875, "completions/min_length": 847.0, "completions/min_terminated_length": 847.0, "epoch": 0.9517133956386293, "grad_norm": 0.5844832062721252, "kl": 0.0530572235584259, "learning_rate": 1.625e-06, "loss": 0.0093, "num_tokens": 79468030.0, "reward": 1.5320613384246826, "reward_std": 0.05157333239912987, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5320611596107483, "rewards/correct_reward_func/std": 0.14916525781154633, "step": 611 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2165.0, "completions/max_terminated_length": 2165.0, "completions/mean_length": 1606.5, "completions/mean_terminated_length": 1606.5, "completions/min_length": 862.0, "completions/min_terminated_length": 862.0, "epoch": 0.9532710280373832, "grad_norm": 0.5951868295669556, "kl": 0.05439838580787182, "learning_rate": 1.6243749999999998e-06, "loss": -0.0238, "num_tokens": 79608940.0, "reward": 1.4870160818099976, "reward_std": 0.1107514277100563, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4989207088947296, "rewards/correct_reward_func/std": 0.1661691665649414, "step": 612 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2785.0, "completions/max_terminated_length": 2785.0, "completions/mean_length": 1537.761962890625, "completions/mean_terminated_length": 1537.761962890625, "completions/min_length": 875.0, "completions/min_terminated_length": 875.0, "epoch": 0.9548286604361371, "grad_norm": 0.5835762023925781, "kl": 0.054533904418349266, "learning_rate": 1.62375e-06, "loss": 0.0028, "num_tokens": 79744142.0, "reward": 1.4982998371124268, "reward_std": 0.08925885707139969, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5102044939994812, "rewards/correct_reward_func/std": 0.21719758212566376, "step": 613 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2433.0, "completions/max_terminated_length": 2433.0, "completions/mean_length": 1523.2261962890625, "completions/mean_terminated_length": 1523.2261962890625, "completions/min_length": 857.0, "completions/min_terminated_length": 857.0, "epoch": 0.956386292834891, "grad_norm": 0.5852372646331787, "kl": 0.05277659185230732, "learning_rate": 1.6231249999999998e-06, "loss": -0.0142, "num_tokens": 79878015.0, "reward": 1.458331823348999, "reward_std": 0.0705060064792633, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4583317041397095, "rewards/correct_reward_func/std": 0.1471869796514511, "step": 614 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2318.0, "completions/max_terminated_length": 2318.0, "completions/mean_length": 1595.261962890625, "completions/mean_terminated_length": 1595.261962890625, "completions/min_length": 1045.0, "completions/min_terminated_length": 1045.0, "epoch": 0.9579439252336449, "grad_norm": 0.5480629801750183, "kl": 0.0494478065520525, "learning_rate": 1.6225e-06, "loss": -0.0132, "num_tokens": 80017915.0, "reward": 1.4836761951446533, "reward_std": 0.06662869453430176, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4836761951446533, "rewards/correct_reward_func/std": 0.1571483314037323, "step": 615 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2420.0, "completions/max_terminated_length": 2420.0, "completions/mean_length": 1565.5238037109375, "completions/mean_terminated_length": 1565.5238037109375, "completions/min_length": 920.0, "completions/min_terminated_length": 920.0, "epoch": 0.9595015576323987, "grad_norm": 0.5563946962356567, "kl": 0.05082782916724682, "learning_rate": 1.6218749999999998e-06, "loss": 0.0008, "num_tokens": 80155491.0, "reward": 1.5200374126434326, "reward_std": 0.0669645443558693, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5200372934341431, "rewards/correct_reward_func/std": 0.18864794075489044, "step": 616 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2085.0, "completions/max_terminated_length": 2085.0, "completions/mean_length": 1500.7381591796875, "completions/mean_terminated_length": 1500.7381591796875, "completions/min_length": 438.0, "completions/min_terminated_length": 438.0, "epoch": 0.9610591900311527, "grad_norm": 0.5845383405685425, "kl": 0.051540493965148926, "learning_rate": 1.62125e-06, "loss": -0.0132, "num_tokens": 80287547.0, "reward": 1.5222618579864502, "reward_std": 0.08891423046588898, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5341665148735046, "rewards/correct_reward_func/std": 0.1643657386302948, "step": 617 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2304.0, "completions/max_terminated_length": 2304.0, "completions/mean_length": 1515.0357666015625, "completions/mean_terminated_length": 1515.0357666015625, "completions/min_length": 872.0, "completions/min_terminated_length": 872.0, "epoch": 0.9626168224299065, "grad_norm": 0.6092395782470703, "kl": 0.05296482518315315, "learning_rate": 1.6206249999999998e-06, "loss": 0.0086, "num_tokens": 80420648.0, "reward": 1.4399313926696777, "reward_std": 0.08079881966114044, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.43993115425109863, "rewards/correct_reward_func/std": 0.13497735559940338, "step": 618 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2472.0, "completions/max_terminated_length": 2472.0, "completions/mean_length": 1513.666748046875, "completions/mean_terminated_length": 1513.666748046875, "completions/min_length": 987.0, "completions/min_terminated_length": 987.0, "epoch": 0.9641744548286605, "grad_norm": 0.5746808052062988, "kl": 0.05054004117846489, "learning_rate": 1.62e-06, "loss": -0.0325, "num_tokens": 80553754.0, "reward": 1.478222370147705, "reward_std": 0.05955832451581955, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4782223105430603, "rewards/correct_reward_func/std": 0.13707558810710907, "step": 619 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3365.0, "completions/max_terminated_length": 3365.0, "completions/mean_length": 1535.46435546875, "completions/mean_terminated_length": 1535.46435546875, "completions/min_length": 869.0, "completions/min_terminated_length": 869.0, "epoch": 0.9657320872274143, "grad_norm": 0.6188405752182007, "kl": 0.05118212662637234, "learning_rate": 1.6193749999999998e-06, "loss": 0.068, "num_tokens": 80688679.0, "reward": 1.4972518682479858, "reward_std": 0.10318266600370407, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.521061360836029, "rewards/correct_reward_func/std": 0.15406495332717896, "step": 620 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2192.0, "completions/mean_length": 1599.0357666015625, "completions/mean_terminated_length": 1519.602294921875, "completions/min_length": 1039.0, "completions/min_terminated_length": 1039.0, "epoch": 0.9672897196261683, "grad_norm": 0.5300514698028564, "kl": 0.04921773634850979, "learning_rate": 1.6187499999999997e-06, "loss": 0.0581, "num_tokens": 80829022.0, "reward": 1.4415113925933838, "reward_std": 0.08664444088935852, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.44151124358177185, "rewards/correct_reward_func/std": 0.12956391274929047, "step": 621 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2215.0, "completions/mean_length": 1577.96435546875, "completions/mean_terminated_length": 1498.277099609375, "completions/min_length": 996.0, "completions/min_terminated_length": 996.0, "epoch": 0.9688473520249221, "grad_norm": 0.585616409778595, "kl": 0.0514204315841198, "learning_rate": 1.6181249999999999e-06, "loss": 0.0696, "num_tokens": 80967613.0, "reward": 1.4526182413101196, "reward_std": 0.07754890620708466, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.45261818170547485, "rewards/correct_reward_func/std": 0.14640595018863678, "step": 622 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2377.0, "completions/mean_length": 1603.0714111328125, "completions/mean_terminated_length": 1523.6866455078125, "completions/min_length": 851.0, "completions/min_terminated_length": 851.0, "epoch": 0.9704049844236761, "grad_norm": 0.6005199551582336, "kl": 0.05098399519920349, "learning_rate": 1.6174999999999998e-06, "loss": 0.0495, "num_tokens": 81108259.0, "reward": 1.4635608196258545, "reward_std": 0.09107384085655212, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4754655957221985, "rewards/correct_reward_func/std": 0.12346034497022629, "step": 623 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2418.0, "completions/max_terminated_length": 2418.0, "completions/mean_length": 1519.261962890625, "completions/mean_terminated_length": 1519.261962890625, "completions/min_length": 861.0, "completions/min_terminated_length": 861.0, "epoch": 0.9719626168224299, "grad_norm": 0.6062273383140564, "kl": 0.049668088555336, "learning_rate": 1.616875e-06, "loss": 0.031, "num_tokens": 81241739.0, "reward": 1.4660530090332031, "reward_std": 0.062135469168424606, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4660530090332031, "rewards/correct_reward_func/std": 0.12143100053071976, "step": 624 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2461.0, "completions/max_terminated_length": 2461.0, "completions/mean_length": 1491.5714111328125, "completions/mean_terminated_length": 1491.5714111328125, "completions/min_length": 906.0, "completions/min_terminated_length": 906.0, "epoch": 0.9735202492211839, "grad_norm": 0.6340486407279968, "kl": 0.05301540531218052, "learning_rate": 1.61625e-06, "loss": -0.0211, "num_tokens": 81372875.0, "reward": 1.487614393234253, "reward_std": 0.0698120966553688, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.48761430382728577, "rewards/correct_reward_func/std": 0.13601452112197876, "step": 625 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2163.0, "completions/max_terminated_length": 2163.0, "completions/mean_length": 1590.8929443359375, "completions/mean_terminated_length": 1590.8929443359375, "completions/min_length": 893.0, "completions/min_terminated_length": 893.0, "epoch": 0.9750778816199377, "grad_norm": 0.5444204807281494, "kl": 0.0505395382642746, "learning_rate": 1.615625e-06, "loss": 0.0167, "num_tokens": 81512804.0, "reward": 1.4724253416061401, "reward_std": 0.05580959469079971, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.47242528200149536, "rewards/correct_reward_func/std": 0.12042959779500961, "step": 626 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2706.0, "completions/max_terminated_length": 2706.0, "completions/mean_length": 1554.75, "completions/mean_terminated_length": 1554.75, "completions/min_length": 795.0, "completions/min_terminated_length": 795.0, "epoch": 0.9766355140186916, "grad_norm": 0.5865889191627502, "kl": 0.05015707015991211, "learning_rate": 1.615e-06, "loss": 0.004, "num_tokens": 81649517.0, "reward": 1.4877718687057495, "reward_std": 0.08064839243888855, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4996766448020935, "rewards/correct_reward_func/std": 0.1834038645029068, "step": 627 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2110.0, "completions/max_terminated_length": 2110.0, "completions/mean_length": 1443.4761962890625, "completions/mean_terminated_length": 1443.4761962890625, "completions/min_length": 899.0, "completions/min_terminated_length": 899.0, "epoch": 0.9781931464174455, "grad_norm": 0.5513234734535217, "kl": 0.054139742627739906, "learning_rate": 1.614375e-06, "loss": 0.006, "num_tokens": 81776583.0, "reward": 1.5102035999298096, "reward_std": 0.05765219032764435, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5102035999298096, "rewards/correct_reward_func/std": 0.1545766443014145, "step": 628 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2619.0, "completions/max_terminated_length": 2619.0, "completions/mean_length": 1546.21435546875, "completions/mean_terminated_length": 1546.21435546875, "completions/min_length": 775.0, "completions/min_terminated_length": 775.0, "epoch": 0.9797507788161994, "grad_norm": 0.592833936214447, "kl": 0.05099848657846451, "learning_rate": 1.61375e-06, "loss": 0.0172, "num_tokens": 81912435.0, "reward": 1.4125304222106934, "reward_std": 0.08528114855289459, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.424435019493103, "rewards/correct_reward_func/std": 0.11915619671344757, "step": 629 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2346.0, "completions/mean_length": 1624.5238037109375, "completions/mean_terminated_length": 1545.3975830078125, "completions/min_length": 1048.0, "completions/min_terminated_length": 1048.0, "epoch": 0.9813084112149533, "grad_norm": 0.5546726584434509, "kl": 0.05285438522696495, "learning_rate": 1.613125e-06, "loss": 0.0583, "num_tokens": 82054865.0, "reward": 1.5317888259887695, "reward_std": 0.12638387084007263, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5436934232711792, "rewards/correct_reward_func/std": 0.1404784917831421, "step": 630 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3120.0, "completions/max_terminated_length": 3120.0, "completions/mean_length": 1527.8095703125, "completions/mean_terminated_length": 1527.8095703125, "completions/min_length": 993.0, "completions/min_terminated_length": 993.0, "epoch": 0.9828660436137072, "grad_norm": 0.5793138146400452, "kl": 0.052825529128313065, "learning_rate": 1.6125e-06, "loss": 0.0209, "num_tokens": 82189249.0, "reward": 1.470069408416748, "reward_std": 0.07235594838857651, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4700692892074585, "rewards/correct_reward_func/std": 0.13610175251960754, "step": 631 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 3391.0, "completions/mean_length": 1610.011962890625, "completions/mean_terminated_length": 1530.7108154296875, "completions/min_length": 843.0, "completions/min_terminated_length": 843.0, "epoch": 0.9844236760124611, "grad_norm": 0.5624516606330872, "kl": 0.0496527124196291, "learning_rate": 1.611875e-06, "loss": 0.0426, "num_tokens": 82330616.0, "reward": 1.531969428062439, "reward_std": 0.10485806316137314, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.543874204158783, "rewards/correct_reward_func/std": 0.16279828548431396, "step": 632 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2136.0, "completions/max_terminated_length": 2136.0, "completions/mean_length": 1469.1190185546875, "completions/mean_terminated_length": 1469.1190185546875, "completions/min_length": 953.0, "completions/min_terminated_length": 953.0, "epoch": 0.985981308411215, "grad_norm": 0.5987145304679871, "kl": 0.052102504298090935, "learning_rate": 1.61125e-06, "loss": 0.019, "num_tokens": 82459950.0, "reward": 1.5328919887542725, "reward_std": 0.08720895648002625, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5447967648506165, "rewards/correct_reward_func/std": 0.17077518999576569, "step": 633 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2012.0, "completions/max_terminated_length": 2012.0, "completions/mean_length": 1462.15478515625, "completions/mean_terminated_length": 1462.15478515625, "completions/min_length": 789.0, "completions/min_terminated_length": 789.0, "epoch": 0.9875389408099688, "grad_norm": 0.5946549773216248, "kl": 0.05218057334423065, "learning_rate": 1.610625e-06, "loss": -0.0021, "num_tokens": 82588483.0, "reward": 1.4602668285369873, "reward_std": 0.06267713755369186, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.46026673913002014, "rewards/correct_reward_func/std": 0.11811360716819763, "step": 634 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2227.0, "completions/max_terminated_length": 2227.0, "completions/mean_length": 1475.15478515625, "completions/mean_terminated_length": 1475.15478515625, "completions/min_length": 870.0, "completions/min_terminated_length": 870.0, "epoch": 0.9890965732087228, "grad_norm": 0.6137917041778564, "kl": 0.05290712043642998, "learning_rate": 1.61e-06, "loss": 0.0123, "num_tokens": 82718324.0, "reward": 1.4867254495620728, "reward_std": 0.060699447989463806, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.486725389957428, "rewards/correct_reward_func/std": 0.15401968359947205, "step": 635 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 3418.0, "completions/mean_length": 1539.71435546875, "completions/mean_terminated_length": 1459.566162109375, "completions/min_length": 876.0, "completions/min_terminated_length": 876.0, "epoch": 0.9906542056074766, "grad_norm": 0.5406956672668457, "kl": 0.05111967213451862, "learning_rate": 1.609375e-06, "loss": 0.0556, "num_tokens": 82853534.0, "reward": 1.5321414470672607, "reward_std": 0.08517434448003769, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.532141387462616, "rewards/correct_reward_func/std": 0.1582731455564499, "step": 636 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2517.0, "completions/max_terminated_length": 2517.0, "completions/mean_length": 1444.59521484375, "completions/mean_terminated_length": 1444.59521484375, "completions/min_length": 589.0, "completions/min_terminated_length": 589.0, "epoch": 0.9922118380062306, "grad_norm": 0.5991398096084595, "kl": 0.04915725626051426, "learning_rate": 1.6087499999999998e-06, "loss": 0.0246, "num_tokens": 82980928.0, "reward": 1.5238535404205322, "reward_std": 0.04892566055059433, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5238535404205322, "rewards/correct_reward_func/std": 0.12992213666439056, "step": 637 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2228.0, "completions/max_terminated_length": 2228.0, "completions/mean_length": 1405.952392578125, "completions/mean_terminated_length": 1405.952392578125, "completions/min_length": 444.0, "completions/min_terminated_length": 444.0, "epoch": 0.9937694704049844, "grad_norm": 0.6216338872909546, "kl": 0.052867574617266655, "learning_rate": 1.608125e-06, "loss": 0.0101, "num_tokens": 83104848.0, "reward": 1.5133877992630005, "reward_std": 0.07994896173477173, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5133876204490662, "rewards/correct_reward_func/std": 0.1903848499059677, "step": 638 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2685.0, "completions/max_terminated_length": 2685.0, "completions/mean_length": 1475.5833740234375, "completions/mean_terminated_length": 1475.5833740234375, "completions/min_length": 629.0, "completions/min_terminated_length": 629.0, "epoch": 0.9953271028037384, "grad_norm": 0.5925660133361816, "kl": 0.05206291750073433, "learning_rate": 1.6074999999999999e-06, "loss": 0.0255, "num_tokens": 83234737.0, "reward": 1.497523307800293, "reward_std": 0.0626191571354866, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4975232183933258, "rewards/correct_reward_func/std": 0.18619437515735626, "step": 639 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5409.0, "completions/max_terminated_length": 5409.0, "completions/mean_length": 1527.107177734375, "completions/mean_terminated_length": 1527.107177734375, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.9968847352024922, "grad_norm": 0.6247422695159912, "kl": 0.04974444583058357, "learning_rate": 1.606875e-06, "loss": -0.0138, "num_tokens": 83369086.0, "reward": 1.4803240299224854, "reward_std": 0.07588109374046326, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4922286570072174, "rewards/correct_reward_func/std": 0.15059354901313782, "step": 640 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2251.0, "completions/max_terminated_length": 2251.0, "completions/mean_length": 1502.4761962890625, "completions/mean_terminated_length": 1502.4761962890625, "completions/min_length": 786.0, "completions/min_terminated_length": 786.0, "epoch": 0.9984423676012462, "grad_norm": 0.5777775645256042, "kl": 0.050743360072374344, "learning_rate": 1.6062499999999999e-06, "loss": -0.0207, "num_tokens": 83501378.0, "reward": 1.5227001905441284, "reward_std": 0.05805504322052002, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5227001309394836, "rewards/correct_reward_func/std": 0.12714529037475586, "step": 641 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2487.0, "completions/max_terminated_length": 2487.0, "completions/mean_length": 1441.857177734375, "completions/mean_terminated_length": 1441.857177734375, "completions/min_length": 899.0, "completions/min_terminated_length": 899.0, "epoch": 1.0, "grad_norm": 0.6082573533058167, "kl": 0.05472211726009846, "learning_rate": 1.605625e-06, "loss": 0.0021, "num_tokens": 83628314.0, "reward": 1.4580137729644775, "reward_std": 0.11369025707244873, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.4818231463432312, "rewards/correct_reward_func/std": 0.1592247188091278, "step": 642 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2915.0, "completions/mean_length": 1665.71435546875, "completions/mean_terminated_length": 1587.084228515625, "completions/min_length": 1055.0, "completions/min_terminated_length": 1055.0, "epoch": 1.0015576323987538, "grad_norm": 0.5756648778915405, "kl": 0.05213580280542374, "learning_rate": 1.6049999999999999e-06, "loss": 0.0548, "num_tokens": 83774564.0, "reward": 1.4815843105316162, "reward_std": 0.07832953333854675, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.48158419132232666, "rewards/correct_reward_func/std": 0.12412244081497192, "step": 643 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2377.0, "completions/max_terminated_length": 2377.0, "completions/mean_length": 1476.797607421875, "completions/mean_terminated_length": 1476.797607421875, "completions/min_length": 644.0, "completions/min_terminated_length": 644.0, "epoch": 1.0031152647975077, "grad_norm": 0.594108521938324, "kl": 0.05231509543955326, "learning_rate": 1.604375e-06, "loss": 0.0001, "num_tokens": 83904741.0, "reward": 1.5084398984909058, "reward_std": 0.08493813127279282, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5203444957733154, "rewards/correct_reward_func/std": 0.11909134685993195, "step": 644 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2452.0, "completions/max_terminated_length": 2452.0, "completions/mean_length": 1510.357177734375, "completions/mean_terminated_length": 1510.357177734375, "completions/min_length": 867.0, "completions/min_terminated_length": 867.0, "epoch": 1.0046728971962617, "grad_norm": 0.6033718585968018, "kl": 0.05159814655780792, "learning_rate": 1.6037499999999999e-06, "loss": -0.0043, "num_tokens": 84037497.0, "reward": 1.5596506595611572, "reward_std": 0.06282084435224533, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5596506595611572, "rewards/correct_reward_func/std": 0.18580923974514008, "step": 645 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2500.0, "completions/max_terminated_length": 2500.0, "completions/mean_length": 1567.84521484375, "completions/mean_terminated_length": 1567.84521484375, "completions/min_length": 1075.0, "completions/min_terminated_length": 1075.0, "epoch": 1.0062305295950156, "grad_norm": 0.5789066553115845, "kl": 0.052623504772782326, "learning_rate": 1.6031249999999998e-06, "loss": 0.0338, "num_tokens": 84175148.0, "reward": 1.5069093704223633, "reward_std": 0.08146540820598602, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5188140869140625, "rewards/correct_reward_func/std": 0.13371284306049347, "step": 646 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2175.0, "completions/max_terminated_length": 2175.0, "completions/mean_length": 1484.5, "completions/mean_terminated_length": 1484.5, "completions/min_length": 798.0, "completions/min_terminated_length": 798.0, "epoch": 1.0077881619937694, "grad_norm": 0.5949665307998657, "kl": 0.05212471820414066, "learning_rate": 1.6025e-06, "loss": 0.0168, "num_tokens": 84305870.0, "reward": 1.5230427980422974, "reward_std": 0.049363430589437485, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5230426788330078, "rewards/correct_reward_func/std": 0.12559856474399567, "step": 647 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.023809523809523836, "completions/max_length": 8192.0, "completions/max_terminated_length": 3188.0, "completions/mean_length": 1677.916748046875, "completions/mean_terminated_length": 1519.0364990234375, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 1.0093457943925233, "grad_norm": 0.5194585919380188, "kl": 0.04932490363717079, "learning_rate": 1.6018749999999998e-06, "loss": 0.0928, "num_tokens": 84452887.0, "reward": 1.4853767156600952, "reward_std": 0.08370485156774521, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4853765368461609, "rewards/correct_reward_func/std": 0.16985315084457397, "step": 648 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2393.0, "completions/max_terminated_length": 2393.0, "completions/mean_length": 1499.5714111328125, "completions/mean_terminated_length": 1499.5714111328125, "completions/min_length": 778.0, "completions/min_terminated_length": 778.0, "epoch": 1.0109034267912773, "grad_norm": 0.6075387597084045, "kl": 0.05233858525753021, "learning_rate": 1.60125e-06, "loss": 0.0018, "num_tokens": 84585019.0, "reward": 1.551127314567566, "reward_std": 0.05537908524274826, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5511272549629211, "rewards/correct_reward_func/std": 0.14352430403232574, "step": 649 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2385.0, "completions/max_terminated_length": 2385.0, "completions/mean_length": 1476.90478515625, "completions/mean_terminated_length": 1476.90478515625, "completions/min_length": 824.0, "completions/min_terminated_length": 824.0, "epoch": 1.0124610591900312, "grad_norm": 0.5924170017242432, "kl": 0.05188839137554169, "learning_rate": 1.6006249999999998e-06, "loss": 0.0245, "num_tokens": 84714923.0, "reward": 1.5396331548690796, "reward_std": 0.06312854588031769, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5396330952644348, "rewards/correct_reward_func/std": 0.1734156459569931, "step": 650 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2618.0, "completions/max_terminated_length": 2618.0, "completions/mean_length": 1465.4881591796875, "completions/mean_terminated_length": 1465.4881591796875, "completions/min_length": 618.0, "completions/min_terminated_length": 618.0, "epoch": 1.014018691588785, "grad_norm": 0.597648024559021, "kl": 0.05065512843430042, "learning_rate": 1.6e-06, "loss": 0.0192, "num_tokens": 84844012.0, "reward": 1.5225285291671753, "reward_std": 0.05072065815329552, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5225285291671753, "rewards/correct_reward_func/std": 0.17245061695575714, "step": 651 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2194.0, "completions/max_terminated_length": 2194.0, "completions/mean_length": 1471.21435546875, "completions/mean_terminated_length": 1471.21435546875, "completions/min_length": 848.0, "completions/min_terminated_length": 848.0, "epoch": 1.0155763239875388, "grad_norm": 0.5615136623382568, "kl": 0.05269411765038967, "learning_rate": 1.5993749999999998e-06, "loss": -0.0099, "num_tokens": 84973576.0, "reward": 1.488713026046753, "reward_std": 0.05458061024546623, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.48871293663978577, "rewards/correct_reward_func/std": 0.1673632711172104, "step": 652 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3245.0, "completions/max_terminated_length": 3245.0, "completions/mean_length": 1455.2738037109375, "completions/mean_terminated_length": 1455.2738037109375, "completions/min_length": 853.0, "completions/min_terminated_length": 853.0, "epoch": 1.017133956386293, "grad_norm": 0.5472370386123657, "kl": 0.05032069608569145, "learning_rate": 1.5987499999999997e-06, "loss": -0.0272, "num_tokens": 85101609.0, "reward": 1.532504916191101, "reward_std": 0.04785650223493576, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5325048565864563, "rewards/correct_reward_func/std": 0.19464264810085297, "step": 653 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2291.0, "completions/max_terminated_length": 2291.0, "completions/mean_length": 1474.0714111328125, "completions/mean_terminated_length": 1474.0714111328125, "completions/min_length": 463.0, "completions/min_terminated_length": 463.0, "epoch": 1.0186915887850467, "grad_norm": 0.5767446756362915, "kl": 0.05521121807396412, "learning_rate": 1.5981249999999998e-06, "loss": -0.0144, "num_tokens": 85231431.0, "reward": 1.4579790830612183, "reward_std": 0.09587103873491287, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.46988385915756226, "rewards/correct_reward_func/std": 0.13316239416599274, "step": 654 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2599.0, "completions/max_terminated_length": 2599.0, "completions/mean_length": 1522.8333740234375, "completions/mean_terminated_length": 1522.8333740234375, "completions/min_length": 877.0, "completions/min_terminated_length": 877.0, "epoch": 1.0202492211838006, "grad_norm": 0.5800827741622925, "kl": 0.05396328121423721, "learning_rate": 1.5975e-06, "loss": 0.0364, "num_tokens": 85365211.0, "reward": 1.501022219657898, "reward_std": 0.076015904545784, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5010221600532532, "rewards/correct_reward_func/std": 0.15128661692142487, "step": 655 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2184.0, "completions/max_terminated_length": 2184.0, "completions/mean_length": 1491.1429443359375, "completions/mean_terminated_length": 1491.1429443359375, "completions/min_length": 734.0, "completions/min_terminated_length": 734.0, "epoch": 1.0218068535825544, "grad_norm": 0.5550655126571655, "kl": 0.05126112699508667, "learning_rate": 1.596875e-06, "loss": 0.0014, "num_tokens": 85496383.0, "reward": 1.4312056303024292, "reward_std": 0.08455885946750641, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.44311028718948364, "rewards/correct_reward_func/std": 0.12594355642795563, "step": 656 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2711.0, "completions/max_terminated_length": 2711.0, "completions/mean_length": 1563.452392578125, "completions/mean_terminated_length": 1563.452392578125, "completions/min_length": 995.0, "completions/min_terminated_length": 995.0, "epoch": 1.0233644859813085, "grad_norm": 0.5272036194801331, "kl": 0.05299381539225578, "learning_rate": 1.59625e-06, "loss": -0.0094, "num_tokens": 85633941.0, "reward": 1.4453727006912231, "reward_std": 0.060865722596645355, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.44537264108657837, "rewards/correct_reward_func/std": 0.12575693428516388, "step": 657 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2520.0, "completions/max_terminated_length": 2520.0, "completions/mean_length": 1494.107177734375, "completions/mean_terminated_length": 1494.107177734375, "completions/min_length": 879.0, "completions/min_terminated_length": 879.0, "epoch": 1.0249221183800623, "grad_norm": 0.629683792591095, "kl": 0.053053101524710655, "learning_rate": 1.595625e-06, "loss": -0.0115, "num_tokens": 85765350.0, "reward": 1.4953358173370361, "reward_std": 0.06038458272814751, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.49533572793006897, "rewards/correct_reward_func/std": 0.1519479900598526, "step": 658 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2727.0, "completions/max_terminated_length": 2727.0, "completions/mean_length": 1514.0833740234375, "completions/mean_terminated_length": 1514.0833740234375, "completions/min_length": 781.0, "completions/min_terminated_length": 781.0, "epoch": 1.0264797507788161, "grad_norm": 0.5846103429794312, "kl": 0.05215497314929962, "learning_rate": 1.595e-06, "loss": -0.0266, "num_tokens": 85898593.0, "reward": 1.5464462041854858, "reward_std": 0.07301543653011322, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5464460253715515, "rewards/correct_reward_func/std": 0.11028709262609482, "step": 659 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2479.0, "completions/max_terminated_length": 2479.0, "completions/mean_length": 1448.4405517578125, "completions/mean_terminated_length": 1448.4405517578125, "completions/min_length": 602.0, "completions/min_terminated_length": 602.0, "epoch": 1.02803738317757, "grad_norm": 0.6190741658210754, "kl": 0.052393680438399315, "learning_rate": 1.594375e-06, "loss": -0.0017, "num_tokens": 86026226.0, "reward": 1.4759933948516846, "reward_std": 0.04937182739377022, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.47599339485168457, "rewards/correct_reward_func/std": 0.13999196887016296, "step": 660 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2861.0, "completions/max_terminated_length": 2861.0, "completions/mean_length": 1479.8809814453125, "completions/mean_terminated_length": 1479.8809814453125, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "epoch": 1.029595015576324, "grad_norm": 0.5660926699638367, "kl": 0.05401911213994026, "learning_rate": 1.59375e-06, "loss": -0.0151, "num_tokens": 86156482.0, "reward": 1.4849108457565308, "reward_std": 0.07375740259885788, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.48491087555885315, "rewards/correct_reward_func/std": 0.17290189862251282, "step": 661 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4504.0, "completions/max_terminated_length": 4504.0, "completions/mean_length": 1467.297607421875, "completions/mean_terminated_length": 1467.297607421875, "completions/min_length": 710.0, "completions/min_terminated_length": 710.0, "epoch": 1.0311526479750779, "grad_norm": 0.6604934930801392, "kl": 0.054234541952610016, "learning_rate": 1.5931249999999999e-06, "loss": 0.0182, "num_tokens": 86285585.0, "reward": 1.4347630739212036, "reward_std": 0.0708894431591034, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4347630441188812, "rewards/correct_reward_func/std": 0.1494257152080536, "step": 662 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2484.0, "completions/max_terminated_length": 2484.0, "completions/mean_length": 1498.416748046875, "completions/mean_terminated_length": 1498.416748046875, "completions/min_length": 363.0, "completions/min_terminated_length": 363.0, "epoch": 1.0327102803738317, "grad_norm": 0.5682238936424255, "kl": 0.05228089354932308, "learning_rate": 1.5925e-06, "loss": -0.0695, "num_tokens": 86417554.0, "reward": 1.4704004526138306, "reward_std": 0.09661635756492615, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4704003930091858, "rewards/correct_reward_func/std": 0.16588261723518372, "step": 663 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2789.0, "completions/mean_length": 1710.15478515625, "completions/mean_terminated_length": 1632.0601806640625, "completions/min_length": 1021.0, "completions/min_terminated_length": 1021.0, "epoch": 1.0342679127725856, "grad_norm": 0.565608024597168, "kl": 0.05014815367758274, "learning_rate": 1.591875e-06, "loss": 0.0822, "num_tokens": 86567447.0, "reward": 1.535957932472229, "reward_std": 0.08144499361515045, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5359576344490051, "rewards/correct_reward_func/std": 0.1807517409324646, "step": 664 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2085.0, "completions/max_terminated_length": 2085.0, "completions/mean_length": 1510.21435546875, "completions/mean_terminated_length": 1510.21435546875, "completions/min_length": 827.0, "completions/min_terminated_length": 827.0, "epoch": 1.0358255451713396, "grad_norm": 0.5581820607185364, "kl": 0.0530538372695446, "learning_rate": 1.59125e-06, "loss": -0.0007, "num_tokens": 86700389.0, "reward": 1.5879935026168823, "reward_std": 0.06466535478830338, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5879934430122375, "rewards/correct_reward_func/std": 0.17678600549697876, "step": 665 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2300.0, "completions/mean_length": 1603.9881591796875, "completions/mean_terminated_length": 1524.6143798828125, "completions/min_length": 899.0, "completions/min_terminated_length": 899.0, "epoch": 1.0373831775700935, "grad_norm": 0.5471597909927368, "kl": 0.0508502759039402, "learning_rate": 1.590625e-06, "loss": 0.0707, "num_tokens": 86841100.0, "reward": 1.536754846572876, "reward_std": 0.0733090415596962, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5367547273635864, "rewards/correct_reward_func/std": 0.13262318074703217, "step": 666 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2463.0, "completions/max_terminated_length": 2463.0, "completions/mean_length": 1527.5833740234375, "completions/mean_terminated_length": 1527.5833740234375, "completions/min_length": 982.0, "completions/min_terminated_length": 982.0, "epoch": 1.0389408099688473, "grad_norm": 0.5596882104873657, "kl": 0.05326198227703571, "learning_rate": 1.59e-06, "loss": -0.0095, "num_tokens": 86975351.0, "reward": 1.5027247667312622, "reward_std": 0.06251788139343262, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5027247071266174, "rewards/correct_reward_func/std": 0.15841983258724213, "step": 667 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2329.0, "completions/max_terminated_length": 2329.0, "completions/mean_length": 1483.297607421875, "completions/mean_terminated_length": 1483.297607421875, "completions/min_length": 707.0, "completions/min_terminated_length": 707.0, "epoch": 1.0404984423676011, "grad_norm": 0.5668331384658813, "kl": 0.05328033119440079, "learning_rate": 1.589375e-06, "loss": 0.0127, "num_tokens": 87105834.0, "reward": 1.5575391054153442, "reward_std": 0.09122274816036224, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5575389862060547, "rewards/correct_reward_func/std": 0.16401030123233795, "step": 668 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2943.0, "completions/max_terminated_length": 2943.0, "completions/mean_length": 1532.452392578125, "completions/mean_terminated_length": 1532.452392578125, "completions/min_length": 828.0, "completions/min_terminated_length": 828.0, "epoch": 1.0420560747663552, "grad_norm": 0.5774728655815125, "kl": 0.0538950152695179, "learning_rate": 1.58875e-06, "loss": -0.0052, "num_tokens": 87240410.0, "reward": 1.4293346405029297, "reward_std": 0.045015521347522736, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4293345808982849, "rewards/correct_reward_func/std": 0.10337743908166885, "step": 669 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2548.0, "completions/max_terminated_length": 2548.0, "completions/mean_length": 1630.46435546875, "completions/mean_terminated_length": 1630.46435546875, "completions/min_length": 934.0, "completions/min_terminated_length": 934.0, "epoch": 1.043613707165109, "grad_norm": 0.5443136692047119, "kl": 0.05456646718084812, "learning_rate": 1.588125e-06, "loss": 0.0483, "num_tokens": 87383351.0, "reward": 1.4322932958602905, "reward_std": 0.10355141013860703, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.45610272884368896, "rewards/correct_reward_func/std": 0.14299173653125763, "step": 670 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2304.0, "completions/mean_length": 1638.0714111328125, "completions/mean_terminated_length": 1559.1083984375, "completions/min_length": 924.0, "completions/min_terminated_length": 924.0, "epoch": 1.0451713395638629, "grad_norm": 0.5622561573982239, "kl": 0.05328943021595478, "learning_rate": 1.5874999999999998e-06, "loss": 0.0366, "num_tokens": 87527021.0, "reward": 1.4649417400360107, "reward_std": 0.10010730475187302, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4768464267253876, "rewards/correct_reward_func/std": 0.13820233941078186, "step": 671 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2270.0, "completions/max_terminated_length": 2270.0, "completions/mean_length": 1578.40478515625, "completions/mean_terminated_length": 1578.40478515625, "completions/min_length": 988.0, "completions/min_terminated_length": 988.0, "epoch": 1.0467289719626167, "grad_norm": 0.5663480758666992, "kl": 0.055856646969914436, "learning_rate": 1.586875e-06, "loss": -0.0032, "num_tokens": 87665691.0, "reward": 1.511879324913025, "reward_std": 0.047249529510736465, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5118792653083801, "rewards/correct_reward_func/std": 0.18857014179229736, "step": 672 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2368.0, "completions/max_terminated_length": 2368.0, "completions/mean_length": 1583.3095703125, "completions/mean_terminated_length": 1583.3095703125, "completions/min_length": 1053.0, "completions/min_terminated_length": 1053.0, "epoch": 1.0482866043613708, "grad_norm": 0.5549023747444153, "kl": 0.05541318096220493, "learning_rate": 1.5862499999999998e-06, "loss": -0.0078, "num_tokens": 87804701.0, "reward": 1.4863669872283936, "reward_std": 0.08935274183750153, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.4982716739177704, "rewards/correct_reward_func/std": 0.11579611152410507, "step": 673 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2302.0, "completions/max_terminated_length": 2302.0, "completions/mean_length": 1576.75, "completions/mean_terminated_length": 1576.75, "completions/min_length": 821.0, "completions/min_terminated_length": 821.0, "epoch": 1.0498442367601246, "grad_norm": 0.5554832220077515, "kl": 0.056341828778386116, "learning_rate": 1.585625e-06, "loss": 0.0043, "num_tokens": 87943214.0, "reward": 1.4435389041900635, "reward_std": 0.06262954324483871, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4435388445854187, "rewards/correct_reward_func/std": 0.1332552284002304, "step": 674 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2357.0, "completions/max_terminated_length": 2357.0, "completions/mean_length": 1566.166748046875, "completions/mean_terminated_length": 1566.166748046875, "completions/min_length": 881.0, "completions/min_terminated_length": 881.0, "epoch": 1.0514018691588785, "grad_norm": 0.5815324187278748, "kl": 0.05804356001317501, "learning_rate": 1.5849999999999999e-06, "loss": 0.0252, "num_tokens": 88080898.0, "reward": 1.5119918584823608, "reward_std": 0.07926620543003082, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5238964557647705, "rewards/correct_reward_func/std": 0.1911584585905075, "step": 675 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2573.0, "completions/max_terminated_length": 2573.0, "completions/mean_length": 1587.4405517578125, "completions/mean_terminated_length": 1587.4405517578125, "completions/min_length": 994.0, "completions/min_terminated_length": 994.0, "epoch": 1.0529595015576323, "grad_norm": 0.5776386857032776, "kl": 0.055761074647307396, "learning_rate": 1.584375e-06, "loss": 0.0302, "num_tokens": 88220291.0, "reward": 1.4661586284637451, "reward_std": 0.05945530906319618, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.46615859866142273, "rewards/correct_reward_func/std": 0.11711110919713974, "step": 676 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3017.0, "completions/max_terminated_length": 3017.0, "completions/mean_length": 1575.71435546875, "completions/mean_terminated_length": 1575.71435546875, "completions/min_length": 1013.0, "completions/min_terminated_length": 1013.0, "epoch": 1.0545171339563864, "grad_norm": 0.5745882987976074, "kl": 0.053962018340826035, "learning_rate": 1.5837499999999999e-06, "loss": 0.0093, "num_tokens": 88358657.0, "reward": 1.5064351558685303, "reward_std": 0.06469320505857468, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5064350962638855, "rewards/correct_reward_func/std": 0.15094655752182007, "step": 677 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2058.0, "completions/max_terminated_length": 2058.0, "completions/mean_length": 1459.84521484375, "completions/mean_terminated_length": 1459.84521484375, "completions/min_length": 728.0, "completions/min_terminated_length": 728.0, "epoch": 1.0560747663551402, "grad_norm": 0.5632085800170898, "kl": 0.0573277622461319, "learning_rate": 1.5831249999999998e-06, "loss": -0.0196, "num_tokens": 88487278.0, "reward": 1.5942164659500122, "reward_std": 0.08631883561611176, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.6061212420463562, "rewards/correct_reward_func/std": 0.1588619351387024, "step": 678 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2402.0, "completions/max_terminated_length": 2402.0, "completions/mean_length": 1549.75, "completions/mean_terminated_length": 1549.75, "completions/min_length": 832.0, "completions/min_terminated_length": 832.0, "epoch": 1.057632398753894, "grad_norm": 0.5981232523918152, "kl": 0.05635823681950569, "learning_rate": 1.5824999999999999e-06, "loss": 0.0156, "num_tokens": 88623529.0, "reward": 1.4801684617996216, "reward_std": 0.04865288734436035, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4801684021949768, "rewards/correct_reward_func/std": 0.14745546877384186, "step": 679 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2477.0, "completions/mean_length": 1631.0714111328125, "completions/mean_terminated_length": 1552.0240478515625, "completions/min_length": 865.0, "completions/min_terminated_length": 865.0, "epoch": 1.0591900311526479, "grad_norm": 0.5360672473907471, "kl": 0.05281771533191204, "learning_rate": 1.5818749999999998e-06, "loss": 0.0423, "num_tokens": 88766485.0, "reward": 1.507429599761963, "reward_std": 0.05878061428666115, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5074294805526733, "rewards/correct_reward_func/std": 0.1723850518465042, "step": 680 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2320.0, "completions/max_terminated_length": 2320.0, "completions/mean_length": 1525.7381591796875, "completions/mean_terminated_length": 1525.7381591796875, "completions/min_length": 847.0, "completions/min_terminated_length": 847.0, "epoch": 1.060747663551402, "grad_norm": 0.5318810939788818, "kl": 0.055077340453863144, "learning_rate": 1.58125e-06, "loss": -0.0029, "num_tokens": 88900719.0, "reward": 1.5089659690856934, "reward_std": 0.060308195650577545, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5089658498764038, "rewards/correct_reward_func/std": 0.17798349261283875, "step": 681 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2428.0, "completions/max_terminated_length": 2428.0, "completions/mean_length": 1492.8214111328125, "completions/mean_terminated_length": 1492.8214111328125, "completions/min_length": 956.0, "completions/min_terminated_length": 956.0, "epoch": 1.0623052959501558, "grad_norm": 0.6031827330589294, "kl": 0.05352173000574112, "learning_rate": 1.5806249999999998e-06, "loss": -0.0015, "num_tokens": 89031852.0, "reward": 1.4641478061676025, "reward_std": 0.06496328860521317, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4641478359699249, "rewards/correct_reward_func/std": 0.1625915914773941, "step": 682 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2383.0, "completions/mean_length": 1627.09521484375, "completions/mean_terminated_length": 1548.0, "completions/min_length": 963.0, "completions/min_terminated_length": 963.0, "epoch": 1.0638629283489096, "grad_norm": 0.5318005084991455, "kl": 0.05356441251933575, "learning_rate": 1.58e-06, "loss": 0.0701, "num_tokens": 89174522.0, "reward": 1.5102105140686035, "reward_std": 0.09376493096351624, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.5221152305603027, "rewards/correct_reward_func/std": 0.16014137864112854, "step": 683 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2362.0, "completions/mean_length": 1641.2381591796875, "completions/mean_terminated_length": 1562.313232421875, "completions/min_length": 961.0, "completions/min_terminated_length": 961.0, "epoch": 1.0654205607476634, "grad_norm": 0.5863988995552063, "kl": 0.055096494033932686, "learning_rate": 1.5793749999999998e-06, "loss": 0.0739, "num_tokens": 89318332.0, "reward": 1.4779866933822632, "reward_std": 0.0922832116484642, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.48989132046699524, "rewards/correct_reward_func/std": 0.1688835322856903, "step": 684 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2260.0, "completions/max_terminated_length": 2260.0, "completions/mean_length": 1586.9405517578125, "completions/mean_terminated_length": 1586.9405517578125, "completions/min_length": 1013.0, "completions/min_terminated_length": 1013.0, "epoch": 1.0669781931464175, "grad_norm": 0.5507916212081909, "kl": 0.05459226667881012, "learning_rate": 1.5787500000000001e-06, "loss": -0.0182, "num_tokens": 89457557.0, "reward": 1.543197512626648, "reward_std": 0.06276614218950272, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5431973338127136, "rewards/correct_reward_func/std": 0.18692582845687866, "step": 685 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4759.0, "completions/max_terminated_length": 4759.0, "completions/mean_length": 1548.416748046875, "completions/mean_terminated_length": 1548.416748046875, "completions/min_length": 848.0, "completions/min_terminated_length": 848.0, "epoch": 1.0685358255451713, "grad_norm": 0.5588350296020508, "kl": 0.05437791533768177, "learning_rate": 1.578125e-06, "loss": -0.0017, "num_tokens": 89593504.0, "reward": 1.4723842144012451, "reward_std": 0.06399713456630707, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4723840355873108, "rewards/correct_reward_func/std": 0.15390437841415405, "step": 686 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3083.0, "completions/max_terminated_length": 3083.0, "completions/mean_length": 1471.7738037109375, "completions/mean_terminated_length": 1471.7738037109375, "completions/min_length": 437.0, "completions/min_terminated_length": 437.0, "epoch": 1.0700934579439252, "grad_norm": 0.6055949926376343, "kl": 0.05486376769840717, "learning_rate": 1.5775e-06, "loss": -0.0051, "num_tokens": 89723055.0, "reward": 1.460752010345459, "reward_std": 0.09401778876781464, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.472656786441803, "rewards/correct_reward_func/std": 0.18991245329380035, "step": 687 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2914.0, "completions/max_terminated_length": 2914.0, "completions/mean_length": 1501.357177734375, "completions/mean_terminated_length": 1501.357177734375, "completions/min_length": 852.0, "completions/min_terminated_length": 852.0, "epoch": 1.071651090342679, "grad_norm": 0.6138256788253784, "kl": 0.0549286063760519, "learning_rate": 1.576875e-06, "loss": 0.0095, "num_tokens": 89855031.0, "reward": 1.452558994293213, "reward_std": 0.062143485993146896, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.45255884528160095, "rewards/correct_reward_func/std": 0.14738577604293823, "step": 688 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2026.0, "completions/max_terminated_length": 2026.0, "completions/mean_length": 1508.857177734375, "completions/mean_terminated_length": 1508.857177734375, "completions/min_length": 958.0, "completions/min_terminated_length": 958.0, "epoch": 1.073208722741433, "grad_norm": 0.5856159329414368, "kl": 0.05561050772666931, "learning_rate": 1.57625e-06, "loss": 0.0011, "num_tokens": 89987721.0, "reward": 1.4332712888717651, "reward_std": 0.04271453246474266, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4332713186740875, "rewards/correct_reward_func/std": 0.11550971120595932, "step": 689 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2183.0, "completions/max_terminated_length": 2183.0, "completions/mean_length": 1527.1785888671875, "completions/mean_terminated_length": 1527.1785888671875, "completions/min_length": 890.0, "completions/min_terminated_length": 890.0, "epoch": 1.074766355140187, "grad_norm": 0.6025081276893616, "kl": 0.05726106837391853, "learning_rate": 1.575625e-06, "loss": 0.0185, "num_tokens": 90121794.0, "reward": 1.4257436990737915, "reward_std": 0.09166575968265533, "rewards/contains_chinese/mean": 0.988095223903656, "rewards/contains_chinese/std": 0.10910894721746445, "rewards/correct_reward_func/mean": 0.43764835596084595, "rewards/correct_reward_func/std": 0.14188778400421143, "step": 690 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2313.0, "completions/max_terminated_length": 2313.0, "completions/mean_length": 1449.4761962890625, "completions/mean_terminated_length": 1449.4761962890625, "completions/min_length": 971.0, "completions/min_terminated_length": 971.0, "epoch": 1.0763239875389408, "grad_norm": 0.6137524843215942, "kl": 0.059694841504096985, "learning_rate": 1.575e-06, "loss": -0.0008, "num_tokens": 90249376.0, "reward": 1.5255221128463745, "reward_std": 0.05913606286048889, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5255220532417297, "rewards/correct_reward_func/std": 0.1678466647863388, "step": 691 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2680.0, "completions/max_terminated_length": 2680.0, "completions/mean_length": 1516.2261962890625, "completions/mean_terminated_length": 1516.2261962890625, "completions/min_length": 878.0, "completions/min_terminated_length": 878.0, "epoch": 1.0778816199376946, "grad_norm": 0.5627336502075195, "kl": 0.052463850006461143, "learning_rate": 1.574375e-06, "loss": 0.0015, "num_tokens": 90382571.0, "reward": 1.518011450767517, "reward_std": 0.06652691215276718, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5180113911628723, "rewards/correct_reward_func/std": 0.13873633742332458, "step": 692 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.011904761904761862, "completions/max_length": 8192.0, "completions/max_terminated_length": 2380.0, "completions/mean_length": 1571.6429443359375, "completions/mean_terminated_length": 1491.8795166015625, "completions/min_length": 993.0, "completions/min_terminated_length": 993.0, "epoch": 1.0794392523364487, "grad_norm": 0.5776975154876709, "kl": 0.05454135872423649, "learning_rate": 1.57375e-06, "loss": 0.0496, "num_tokens": 90520505.0, "reward": 1.4717916250228882, "reward_std": 0.09959909319877625, "rewards/contains_chinese/mean": 0.976190447807312, "rewards/contains_chinese/std": 0.15337100625038147, "rewards/correct_reward_func/mean": 0.4956010580062866, "rewards/correct_reward_func/std": 0.12597030401229858, "step": 693 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1938.0, "completions/max_terminated_length": 1938.0, "completions/mean_length": 1448.7381591796875, "completions/mean_terminated_length": 1448.7381591796875, "completions/min_length": 989.0, "completions/min_terminated_length": 989.0, "epoch": 1.0809968847352025, "grad_norm": 0.6719677448272705, "kl": 0.054130397737026215, "learning_rate": 1.573125e-06, "loss": 0.0117, "num_tokens": 90648091.0, "reward": 1.4858334064483643, "reward_std": 0.048804186284542084, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4858333170413971, "rewards/correct_reward_func/std": 0.12014701217412949, "step": 694 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2097.0, "completions/max_terminated_length": 2097.0, "completions/mean_length": 1482.857177734375, "completions/mean_terminated_length": 1482.857177734375, "completions/min_length": 882.0, "completions/min_terminated_length": 882.0, "epoch": 1.0825545171339563, "grad_norm": 0.605505645275116, "kl": 0.05565035529434681, "learning_rate": 1.5725e-06, "loss": 0.0116, "num_tokens": 90778639.0, "reward": 1.4878506660461426, "reward_std": 0.05565100908279419, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4878506362438202, "rewards/correct_reward_func/std": 0.15627160668373108, "step": 695 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6738.0, "completions/max_terminated_length": 6738.0, "completions/mean_length": 1650.5, "completions/mean_terminated_length": 1650.5, "completions/min_length": 1163.0, "completions/min_terminated_length": 1163.0, "epoch": 1.0841121495327102, "grad_norm": 0.5941537618637085, "kl": 0.051503732800483704, "learning_rate": 1.5718749999999999e-06, "loss": -0.0238, "num_tokens": 90923377.0, "reward": 1.477596640586853, "reward_std": 0.056624628603458405, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.4775967299938202, "rewards/correct_reward_func/std": 0.1360875815153122, "step": 696 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2387.0, "completions/max_terminated_length": 2387.0, "completions/mean_length": 1535.0, "completions/mean_terminated_length": 1535.0, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 1.0856697819314642, "grad_norm": 0.5858432054519653, "kl": 0.053479718044400215, "learning_rate": 1.57125e-06, "loss": 0.0207, "num_tokens": 91058353.0, "reward": 1.4727427959442139, "reward_std": 0.040079839527606964, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.47274258732795715, "rewards/correct_reward_func/std": 0.10987861454486847, "step": 697 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2683.0, "completions/max_terminated_length": 2683.0, "completions/mean_length": 1514.5357666015625, "completions/mean_terminated_length": 1514.5357666015625, "completions/min_length": 1071.0, "completions/min_terminated_length": 1071.0, "epoch": 1.087227414330218, "grad_norm": 0.5789026618003845, "kl": 0.054706670343875885, "learning_rate": 1.5706249999999999e-06, "loss": 0.0078, "num_tokens": 91191598.0, "reward": 1.528199315071106, "reward_std": 0.13074904680252075, "rewards/contains_chinese/mean": 0.9642857313156128, "rewards/contains_chinese/std": 0.18669146299362183, "rewards/correct_reward_func/mean": 0.5639137029647827, "rewards/correct_reward_func/std": 0.17987313866615295, "step": 698 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2442.0, "completions/max_terminated_length": 2442.0, "completions/mean_length": 1555.3333740234375, "completions/mean_terminated_length": 1555.3333740234375, "completions/min_length": 1020.0, "completions/min_terminated_length": 1020.0, "epoch": 1.088785046728972, "grad_norm": 0.551451563835144, "kl": 0.05342511832714081, "learning_rate": 1.57e-06, "loss": -0.0102, "num_tokens": 91328390.0, "reward": 1.528260588645935, "reward_std": 0.05848051980137825, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5282606482505798, "rewards/correct_reward_func/std": 0.16042807698249817, "step": 699 }, { "clip_ratio": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2572.0, "completions/max_terminated_length": 2572.0, "completions/mean_length": 1527.8333740234375, "completions/mean_terminated_length": 1527.8333740234375, "completions/min_length": 1004.0, "completions/min_terminated_length": 1004.0, "epoch": 1.0903426791277258, "grad_norm": 0.58552086353302, "kl": 0.05442274548113346, "learning_rate": 1.569375e-06, "loss": 0.029, "num_tokens": 91462740.0, "reward": 1.5053088665008545, "reward_std": 0.06954223662614822, "rewards/contains_chinese/mean": 1.0, "rewards/contains_chinese/std": 0.0, "rewards/correct_reward_func/mean": 0.5053088068962097, "rewards/correct_reward_func/std": 0.16963563859462738, "step": 700 } ], "logging_steps": 1.0, "max_steps": 3210, "num_input_tokens_seen": 91462740, "num_train_epochs": 5, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 6, "trial_name": null, "trial_params": null }